New Columns:
Imputation
Note: I did not update the rest of the notebook after adding the geopandas method for filtering out rows that were outside San Francisco, because only ~301 rows were changed, and they were points that won't impact the other methods or analyses much, if at all. 2/6/2018
import numpy as np
import pandas as pd
import os
import time
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import cartopy.crs as ccrs
import cartopy.feature as cfeat
from IPython.display import Image
import geocoder
import geopandas as gpd
from geopandas.tools import sjoin
import shapely
from shapely.geometry import Point
shapely.speedups.enable()
## View all columns when displaying dataframes.
## Use the canonical option name -- the old 'max.columns' only worked via
## pandas' regex-based option matching.
pd.set_option('display.max_columns', 30)
## NOTE(review): '__filename__' is a string *literal*, so dirname() returns ''
## and the paths below resolve relative to the notebook's working directory.
## Presumably intentional for a notebook -- confirm before reusing as a script.
file = os.path.dirname('__filename__')
fp_df = os.path.join(file, 'dataset', 'SFMTA Data 2015 - 2017.zip')
fp_gps = os.path.join(file, 'dataset', 'SFMTA Data GPS 2015 - 2017.zip')
## Dates and time will be converted later
## low memory because ticket number is a mixture of integer and strings
df = pd.read_csv(fp_df, compression = 'zip',parse_dates= False, low_memory=True)
df_gps = pd.read_csv(fp_gps, compression='zip', parse_dates = False, low_memory=True)
## make column names consistent and easier to use: lowercase, underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')
df_gps.columns = df_gps.columns.str.lower().str.replace(' ', '_')
print('Shape of df: {}'.format(df.shape))
print('Shape of df_gps: {}'.format(df_gps.shape))
df.head(3)
df_gps.head(3)
This part was added at the end, so the ordering of the whole notebook is a little weird.
## From https://www.census.gov/geo/maps-data/data/tiger.html
## Contains boundaries for all counties in USA
tmp = gpd.GeoDataFrame.from_file('dataset/shapefile/cb_2016_us_county_500k')
## Obtain only San Francisco boundary
## (FIPS codes: STATEFP '06' = California, COUNTYFP '075' = San Francisco)
sf = tmp[((tmp.COUNTYFP == '075') & (tmp.STATEFP == '06'))].copy()
sf
## There are 3 other smaller polygons. Part of San Francisco?
sf.plot()
plt.title('Boundary of San Francisco')
plt.show()
## gps that are zero are nan
df_gps.loc[df_gps.hh_gpslat == 0, 'hh_gpslat'] = np.nan
df_gps.loc[df_gps.hh_gpslon == 0, 'hh_gpslon'] = np.nan
## gps outside of bounding box is nan
# Latitude = 37.70, 37.85
outside = (df_gps.hh_gpslat < 37.7) | (df_gps.hh_gpslat > 37.85)
df_gps.loc[outside, 'hh_gpslat'] = np.nan
df_gps.loc[outside, 'hh_gpslon'] = np.nan
print(outside.sum())
# longitude = -122.54, -122.32
outside = (df_gps.hh_gpslon < -122.54) | (df_gps.hh_gpslon > -122.32)
df_gps.loc[outside, 'hh_gpslat'] = np.nan
df_gps.loc[outside, 'hh_gpslon'] = np.nan
print(outside.sum())
print('Total number of nan: {:,}'.format(df_gps.hh_gpslat.isna().sum()))
## Only classify rows with gps data
tmp = df_gps.loc[df_gps.hh_gpslat.notna()].copy()
## keep the original df_gps labels in an 'index' column for write-back later
tmp.reset_index(inplace=True)
## projection is just assumed
## NOTE(review): 'epsg:4269' (NAD83) is an assumption, not documented by the
## data source -- confirm if coordinate precision ever matters
gps = gpd.GeoDataFrame(tmp[['index', 'ticket_number']], crs ={'init': 'epsg:4269'},
geometry = [Point(xy) for xy in zip(tmp.hh_gpslon, tmp.hh_gpslat)])
gps.head(2)
## Determine if gps coordinates are inside polygon
## left join keeps every point; points outside SF end up with NAME == NaN
inOut = sjoin(gps, sf[['NAME', 'geometry']], how = 'left')
print('Number of rows outside of SF: {}'.format(inOut.NAME.isna().sum()))
base = sf.plot(color = 'white', edgecolor = 'black', figsize=(30,5))
## Blue if inside
inOut[inOut.NAME.notna()].plot(ax=base, marker='o', color='blue',markersize=10)
## Red if outside
inOut[inOut.NAME.isna()].plot(ax=base, marker='o', color='red',markersize=10)
base.set_xlim(-122.54, -122.32)
base.set_ylim(37.70, 37.85)
plt.title('Points Inside vs Outside')
plt.show()
## set index back to the original df_gps labels (saved before the sjoin)
inOut.set_index('index', inplace=True)
## find points outside of SF (no polygon matched in the spatial join)
outside = inOut[inOut.NAME.isna()]
outside.head(3)
## Convert all points outside SF to nan
## `outside.index` holds df_gps row *labels* (carried through reset_index),
## not positions -- use label-based .loc. The original used .iloc, which only
## worked because labels and positions coincide for the freshly loaded df_gps;
## .loc stays correct even if df_gps is ever re-indexed.
df_gps.loc[outside.index, 'hh_gpslat'] = np.nan
df_gps.loc[outside.index, 'hh_gpslon'] = np.nan
print('Total number of nan after: {:,}'.format(df_gps.hh_gpslat.isna().sum()))
## plot gps points on sf boundaries
def sfplot(pt, markersize=10, imputed=False, limit=False):
    """Plot the GPS points of `pt` on top of the San Francisco boundary.

    Parameters
    ----------
    pt : DataFrame
        Must have hh_gpslat / hh_gpslon / ticket_number columns, and an
        `imputed` boolean column when imputed=True.
    markersize : int
        Size of the plotted points.
    imputed : bool
        If True, plot only the rows flagged as imputed.
    limit : bool
        If True, restrict the axes to the SF bounding box.

    Uses the module-level `sf` boundary GeoDataFrame; shows the plot as a
    side effect and returns None.
    """
    ## Exclude rows that are missing gps
    pt = pt[pt.hh_gpslat.notna()]
    ## Extract only the imputed points
    if imputed:
        pt = pt[pt.imputed]
    ## Create a geodataframe for plotting
    gps = gpd.GeoDataFrame(pt[['ticket_number']], crs={'init': 'epsg:4269'},
                           geometry=[Point(xy) for xy in zip(pt.hh_gpslon, pt.hh_gpslat)])
    ## the base map (SF)
    base = sf.plot(color='white', edgecolor='black', figsize=(30, 5))
    ## BUG FIX: honour the markersize argument (was hard-coded to 10)
    gps.plot(ax=base, marker='o', color='red', markersize=markersize)
    ## Focus on SF only
    if limit:
        plt.xlim(-122.54, -122.32)
        plt.ylim(37.70, 37.85)
    ## show plot
    plt.show()
## Show that all points outside sf are removed
sfplot(df_gps)
## wrap up -- free the intermediates used by the spatial-join step
del tmp, base, inOut, gps, outside
## inspect ticket_type_code before deciding whether to keep it
df.ticket_type_code.unique()
This column only contains the letter 'P', so not a useful column
df.drop(columns = ['ticket_type_code'], inplace = True)
## gps data is from hand held computer while the rest are hand written or other special citation
df_gps['handheld'] = True
## Ticket should all be unique
print('All ticket number for df is unique: {}'\
.format(df.ticket_number.unique().shape[0] == df.shape[0]))
print('All ticket number for df_gps is unique: {}'\
.format(df_gps.ticket_number.unique().shape[0] == df_gps.shape[0]))
print('df nan count: {}'.format(df.ticket_number.isna().sum()))
print('df_gps nan count: {}'.format(df_gps.ticket_number.isna().sum()))
## keep=False marks every member of a duplicated pair, hence the /2 below
tic_num_dup = df_gps.duplicated(subset = 'ticket_number', keep = False)
print('Number of duplicates for gps: {}'.format(tic_num_dup.sum()/2))
df_gps[tic_num_dup]
df[df.ticket_number.isin(['874320112', '863464114'])]
Manually verified on google map:
The locations are not perfectly at the same spot; they are off by approximately a few feet.
## drop index that is duplicate and gps is nan
## (of each duplicated pair, keep the row that actually has coordinates)
drop_idx = df_gps[tic_num_dup & df_gps.hh_gpslat.isna()].index.values
print('df_gps index to be dropped: {}'.format(drop_idx))
df_gps.drop(drop_idx, axis = 0, inplace = True)
print('Number of duplicates is zero: {}'\
.format(df_gps.duplicated(subset = 'ticket_number', keep = False).sum() == 0))
del tic_num_dup, drop_idx ## wrap up
From Shawn: If there is no violation code you can disregard those violations. Often times it comes from handwritten citations not ones that start with 8. 8xxxxxx citation numbers are handheld computer generated and should have all the data.
## Determine why df ticket number is different from df_gps's ticket number
print("df's ticket_number dtype is: {}".format(df.ticket_number.dtype))
print("df_gps's ticket_number dytpe is: {}".format(df_gps.ticket_number.dtype))
## The length of df's ticket_number (object-dtype column of strings)
tic_num_len = df.ticket_number.str.len()
## GPS's ticket are only 9 char long
df_gps.ticket_number.astype('str').str.len().describe()[['min', 'max']].astype('int')
## There is a range of main dataframe's ticket number lengths
print('Number of tickets with length:')
for i in range(6, 12):
print('{}: {}'.format(i, (tic_num_len == i).sum()))
## Observations (recorded before ticket_type_code was dropped above):
## ticket_number start with 'J'
## badge_# is always nan
## ticket_type_code is always 'P
## tick_street_no is always nan
## issue_year is always 2017
df[tic_num_len == 6].head(2)
## Observations:
## ~half badge_# is missing
## 179 tick_street_no is not null
## Inferred from 50 rows:
## ticket_number either start with 'H'/'D' or end with 'G'
## ticket_number with 'G' has violation that looks like GO1.I.6A
## violation with 'G' are related to parking lots??
# df[tic_num_len == 7].sample(n = 50, random_state = 142)
df[tic_num_len == 7].head(2)
df.loc[tic_num_len == 7, 'issue_year'].value_counts()
## I think 002613GX has an extra X
## Because there are only 3 other ticket_number
## with the letter X
df[tic_num_len == 8]
## ticket_number with X is a mistake
## The 2 other is length 11
df[df.ticket_number.str.contains('x', case = False)]
## Could not infer a pattern from 50 rows
## Example: 1003223701, PD27331441, 000220869U
df[tic_num_len == 10].head(2)
df.loc[tic_num_len == 10, 'issue_year'].value_counts()
It looks like ticket_numbers with length 6, 7, or 10 are from different types of officers.
# ## find all those special ticket numbers
# special_tick = df.ticket_number[tic_num_len == 10].str.match('[^A-Z]+', na = False)
# print('Number: {}'.format(special_tick.sum()))
# df.loc[tic_num_len == 10, 'ticket_number'].str.match('[^A-Z]+', na = False)
# df[tic_num_len == 10][special_tick]['issue_year'].value_counts()
## remove x from ticket_number
## NOTE(review): this strips 'X' from *every* ticket_number, not only the
## 8-char '002613GX' case -- presumably acceptable since only 3 other tickets
## contain an X (see above), but confirm
df.ticket_number.replace('X', '', regex = True, inplace = True)
print('Ticket contain X: {}'.format(df.ticket_number.str.contains('X').any()))
del tic_num_len ## wrap up
## Correcting ticket number introduces duplicate ticket numbers
print('N dupluciates? {}'\
.format(df.duplicated(subset='ticket_number', keep=False).sum()))
## Only one ticket 1004255232 doesn't look like an actual duplicate
df[df.duplicated(subset='ticket_number', keep=False)]
## Only 1 is not an actual duplicate but will just remove it
print('Before: {:,}'.format(df.shape[0]))
df.drop_duplicates(subset='ticket_number', keep='first', inplace=True)
print('After: {:,}'.format(df.shape[0]))
issue_mnth_nbr, issue_day_of_month, issue_year, issue_mnth, issue_day, and tick_issue_hour might have been derived from tick_issue_date + tick_issue_time. This makes the information redundant and wastes space. These features can be recreated on demand later.
## Are there any nan
print('Number of nan: {}'.format(df.tick_issue_date.isna().sum()))
## Is issue_date consistently 19 char long
print(df.tick_issue_date.str.len().describe()[['min', 'max']])
## Split the date and time component out
## format is 'YYYY/MM/DD HH:MM:SS' -> column 0 = date, column 1 = time
issue_datetime = df.tick_issue_date.str.split(' ', expand = True)
issue_datetime.head(2)
## Determine if the time component is just 00:00:00
## This means that the time part is just a place holder
issue_datetime[1].unique()
## Split date into year, month, day then convert to integer
issue_date = issue_datetime[0].str.split('/', expand = True)
issue_date.columns = ['year', 'month', 'day']
issue_date = issue_date.apply(pd.to_numeric, axis = 0) ## convert to integer
issue_date.head(2)
## Determine if year, month, and day are the same as the
## pre-derived issue_* columns already present in df
print("Issue Date's Year == issue_year: {}"\
.format((issue_date.year == df.issue_year).all()))
print("Issue Date's Month == issue_mnth_nbr: {}"\
.format((issue_date.month == df.issue_mnth_nbr).all()))
print("Issue Date's day == issue_day_of_month: {}"\
.format((issue_date.day == df.issue_day_of_month).all()))
Since the year, month, and day are the same, I can infer that issue_mnth and issue_day are the same as well.
## Is df_gps time component meaningless?
## Yup, it is just zero
df_gps.tick_issue_date.str.split(' ', expand = True)[1].unique()
## Is df's tick_issue_hour redundant
print('tick_issue_hour nan: {}'.format(df.tick_issue_hour.isna().sum()))
print('tick_issue_time nan: {}'.format(df.tick_issue_time.isna().sum()))
print('Same idx nan: {}'.format(((df.tick_issue_hour.isna() == df.tick_issue_time.isna()).all())))
## Extract hour from time ('HH:MM' -> HH). Converts to float because of NaNs
hour = pd.to_numeric(df.tick_issue_time.str.split(':', expand = True)[0])
hour_not_nan = ~df.tick_issue_time.isna()
print('All match: {}'\
.format((df.tick_issue_hour[hour_not_nan] == hour[hour_not_nan]).all()))
Yes, tick_issue_hour contains the same information as tick_issue_time
## Look into distribution of time, as an HHMM integer (e.g. '14:30' -> 1430)
hour = pd.to_numeric(df.tick_issue_time.replace(':', '', regex = True), errors='coerce')
## hard-coded sanity check against the NaN count observed above
print('Number of NaN should be 3629: {}'.format(hour.isna().sum()))
hour.plot.hist(bins = 20)
plt.show()
del hour_not_nan, issue_datetime, hour, issue_date ## wrap up
## drop the derived columns shown above to be redundant with
## tick_issue_date + tick_issue_time
df.drop(['issue_mnth_nbr', 'issue_day_of_month',
'issue_year', 'issue_mnth',
'issue_day', 'tick_issue_hour',],
axis = 1, inplace = True)
## Remove the placeholder 00:00:00 from date columns
df.tick_issue_date.replace(' 00:00:00', '', regex = True, inplace = True)
df_gps.tick_issue_date.replace(' 00:00:00', '', regex = True, inplace = True)
df.head(1)
df_gps.head(1)
## Need both columns to be string to merge correctly
df_gps.ticket_number = df_gps.ticket_number.astype('str')
## outer merge; indicator=True adds a `_merge` column to audit match quality
df_merge = df.merge(df_gps, how = 'outer', on = 'ticket_number', indicator = True)
df_merge.shape
print('Number of rows with')
for i in ['left_only', 'right_only', 'both']:
print('{}: {}'.format(i, (df_merge._merge == i).sum()))
## Looks like all gps rows match with df's rows
print('Rows difference between df and df_gps: {}'.format(df.shape[0] - df_gps.shape[0]))
## non-handheld tickets should be false (handheld was only set on df_gps rows)
df_merge.handheld.fillna(value = False, inplace = True)
df_merge.head(3)
## Only interested in complete merge
both_idx = df_merge._merge == 'both'
print('All tick_issue_date match: {}'\
.format((df_merge.loc[both_idx, 'tick_issue_date_x'] \
== df_merge.loc[both_idx, 'tick_issue_date_y']).all()))
df_merge.drop(columns=['tick_issue_date_y'], inplace = True)
df_merge.shape
print('Missing time are all from handwritten tickets: {}'\
.format(df_merge.loc[df_merge.tick_issue_time.isna(), 'handheld'].sum() == 0))
## Look into distribution of time as an HHMM integer
hour = pd.to_numeric(df_merge.tick_issue_time.replace(':', '', regex = True), errors='coerce')
hour[~df_merge.handheld].plot.hist(bins = 20)
plt.xlabel('24 Hour Format')
plt.title('Handwritten Tickets Time distribution')
plt.show()
## Extract only tickets that are handwritten
tmp = df_merge.loc[~df_merge.handheld, ['violation_desc_long']].copy()
tmp['hour'] = hour[~df_merge.handheld]
f, (ax1, ax2) = plt.subplots(1, 2, sharey = True, figsize = (12, 4))
tmp.loc[tmp.violation_desc_long == 'REGISTRATION TABS', 'hour']\
.plot(ax = ax1, kind = 'hist', bins = 20)
## red vertical line marks the modal hour
ax1.axvline(tmp.loc[tmp.violation_desc_long == 'REGISTRATION TABS', 'hour'].mode()[0], c = 'red')
ax1.set_title('Handwrittent ticket for Registration Tabs')
ax1.set_xlabel('24HR Time')
tmp.loc[tmp.violation_desc_long == 'DOUBLE PARKING', 'hour']\
.plot(ax = ax2, kind = 'hist', bins = 20)
ax2.axvline(tmp.loc[tmp.violation_desc_long == 'DOUBLE PARKING', 'hour'].mode()[0], c = 'red')
ax2.set_title('Handwrittent ticket for Double Parking')
plt.show()
It does look like different violations have different time trends.
## create a lookup table of the mode time, grouped by violation
## mode() can return several values on ties; .unstack()[0] keeps the first
hour_impute = tmp.groupby('violation_desc_long')['hour']\
.apply(lambda x: x.mode())\
.unstack()[0].to_frame(name = 'mode_time')\
.reset_index()
## zero-pad the HHMM integer to 4 digits so e.g. 830 -> '0830'
hour_impute['mode_time'] = hour_impute.mode_time.astype('int').astype('str').str.pad(4, fillchar='0')
## Need to format time back to the original 'HH:MM' format
hour_impute['hour'] = hour_impute.mode_time.str.slice(start=0, stop=2)
hour_impute['minute'] = hour_impute.mode_time.str.slice(start=2)
hour_impute['time'] = hour_impute.hour.str.cat(hour_impute.minute, sep=':')
hour_impute.head()
## rows whose time is missing; reset_index keeps the df_merge label in 'index'
to_impute = df_merge.loc[df_merge.tick_issue_time.isna(), ['violation_desc_long']].reset_index()
print(to_impute.shape)
## impute missing with mode lookup table
impute = pd.merge(to_impute, hour_impute, how='left', on='violation_desc_long')
impute.set_index('index', inplace = True)
## FAILURE TO DISPLAY has no mode in the lookup, so fall back to noon
impute.time.fillna(value='12:00', inplace=True)
print(impute.shape)
print('Number of nan: {}'.format(impute.time.isna().sum()))
impute.head()
## --- Intentionally disabled alternative imputation (kept for reference) ---
# ## Using Xi's way
# tmp = pd.to_datetime(df_merge.tick_issue_date_x, errors='coerce')
# ## extract date information
# df_merge['year'] = tmp.dt.year
# df_merge['month'] = tmp.dt.month
# df_merge['dow'] = tmp.dt.dayofweek
# ## extract just the hour from the time column
# df_merge['hour'] = pd.to_numeric(df_merge.tick_issue_time.str.slice(start=0, stop=2), errors='coerce')
# ## Create lookup table for hour based on year, month, day of week, and violation_desc_long
# ## slightly different from Xi where he used violation_group is a column made later in the
# ## notebook
# tix_time_lookup = df_merge.groupby(['year', 'month', 'dow', 'violation_desc_long'])['hour'].median().reset_index()
# tix_time_lookup.head(3)
# ## extract just the rows that needs be imputed
# to_impute = df_merge.loc[df_merge.tick_issue_time.isna(), ['violation_desc_long']].reset_index()
# print(to_impute.shape)
# ## impute missing with mode lookup table
# impute = pd.merge(to_impute, tix_time_lookup, how='left', on='violation_desc_long')
# print('Number of nan: {}'.format(impute.hour.isna().sum()))
# ## Imputing this way will result with ~1k nan so not good
# ## If I were to implement Xi's way, I will reorder basically my whole notebook,
# ## for ~3k imputation, I don't think it is worth the work and bugs.
## Integrate result back into dataframe
## `impute.index` holds df_merge row *labels* -- use label-based .loc rather
## than positional .iloc (the original .iloc only worked because df_merge
## currently has a default RangeIndex where labels equal positions).
df_merge.loc[impute.index, 'tick_issue_time'] = impute['time']
## wrap up
del hour, tmp, to_impute, impute, f, ax1, ax2, hour_impute
print('There are no missing rows in issue_time: {}'\
.format(df_merge.tick_issue_time.isna().sum() == 0))
## build a proper datetime from the date plus the (now fully imputed) time
df_merge['issue_datetime'] = pd.to_datetime(df_merge['tick_issue_date_x'] \
+ ' ' + df_merge['tick_issue_time'],
format = '%Y/%m/%d %H:%M')
print('Number of nan: {}'.format(df_merge.issue_datetime.isna().sum()))
df_merge.tail().loc[:,['tick_issue_date_x', 'tick_issue_time', 'issue_datetime']]
df_merge.issue_datetime.value_counts().plot()
plt.show()
print('Before: {}'.format(df_merge.shape))
## Drop redundant columns now that issue_datetime exists
df_merge.drop(columns = ['tick_issue_date_x', 'tick_issue_time'], inplace = True)
print('After: {}'.format(df_merge.shape))
Conclusion: issue_date looks fine.
## tkt_cnt appears in both sources; compare their value distributions
print(df.tkt_cnt.value_counts())
print(df_gps.tkt_cnt.value_counts())
Only df_gps has tkt_cnt values that are not 1.
## Find idx where tkt_cnt does not match between the two sources
tkt_cnt_idx = df_merge['tkt_cnt_x'] != df_merge['tkt_cnt_y']
## Look at idx that doesn't have matching tkt_cnt
df_merge[both_idx & tkt_cnt_idx].head(3)
## Doesn't look like it is connected to date or time
df_merge.loc[both_idx & tkt_cnt_idx].duplicated(subset = 'issue_datetime', keep = False).sum()
## Doesn't look like a specific worker is giving this out
df_merge.loc[both_idx & tkt_cnt_idx, 'badge_#'].value_counts().plot.hist()
plt.title('')
plt.show()
## No conclusion
df_merge.loc[both_idx & tkt_cnt_idx,'tick_corr_type'].value_counts(dropna = False)
I think the count is probably a mistake. This population only accounts for ~0.006%, and tkt_cnt is just 1 for the main dataset. I don't believe these two columns are useful.
print(df_merge.shape)
## both tkt_cnt columns judged uninformative -- drop them
df_merge.drop(columns=['tkt_cnt_x', 'tkt_cnt_y'], inplace = True)
print(df_merge.shape)
## df_gps is fully merged into df_merge; release it
del both_idx, tkt_cnt_idx, df_gps ## wrap up
Conclusion: I believe tkt_cnt does not contain any useful information.
# df_merge['badge_#'].value_counts(dropna = False).to_csv('dataset/badge.csv')
## badge_number.csv
## badge number starts from 1 to 434.
## There are unique ones like 1889, 2405, 9154, 9170, 9371, 9373
## There are gaps in between the numbering usually just one
## but once it hits 400 then the gaps are larger (4+)
print('NaN has: {}'.format(df_merge['badge_#'].isna().sum()))
df_merge['badge_#'].value_counts(dropna = True).plot.hist()
plt.title('Number of Tickets Per Badge Number')
plt.show()
## change badge_# name and replace nan with -1
## (-1 sentinel keeps the column fully numeric so it can be downcast)
df_merge['badge'] = df_merge['badge_#'].copy()
df_merge.loc[df_merge['badge'].isna(), 'badge'] = -1
df_merge['badge'] = pd.to_numeric(df_merge['badge'], downcast='integer')
df_merge.badge.dtype
print(df_merge.shape)
df_merge.drop(columns = ['badge_#'], inplace = True)
print(df_merge.shape)
Conclusion: The values look fine, but I am not sure what those few special badge numbers that are different from the rest represent.
## Only the 50 states plus the capital (DC)
states = {"AL":"Alabama","AK":"Alaska","AZ":"Arizona","AR":"Arkansas",
"CA":"California","CO":"Colorado","CT":"Connecticut","DE":"Delaware",
"DC":"Washington DC","FL":"Florida",
"GA":"Georgia","HI":"Hawaii","ID":"Idaho","IL":"Illinois","IN":"Indiana",
"IA":"Iowa","KS":"Kansas","KY":"Kentucky","LA":"Louisiana","ME":"Maine",
"MD":"Maryland","MA":"Massachusetts","MI":"Michigan",
"MN":"Minnesota","MS":"Mississippi","MO":"Missouri","MT":"Montana","NE":"Nebraska",
"NV":"Nevada","NH":"New Hampshire","NJ":"New Jersey","NM":"New Mexico","NY":"New York",
"NC":"North Carolina","ND":"North Dakota","OH":"Ohio",
"OK":"Oklahoma","OR":"Oregon","PA":"Pennsylvania",
"RI":"Rhode Island","SC":"South Carolina","SD":"South Dakota","TN":"Tennessee",
"TX":"Texas","UT":"Utah","VT":"Vermont","VA":"Virginia",
"WA":"Washington","WV":"West Virginia","WI":"Wisconsin","WY":"Wyoming"}
## abbreviation -> full-name lookup frame for merging
states_df = pd.DataFrame.from_dict(states, orient='index')
states_df.reset_index(inplace = True)
states_df.columns = ['abbr', 'state_full']
states_df.head(3)
df_merge = df_merge.merge(states_df, how = 'left',
left_on = 'plate_state', right_on = 'abbr',
copy = False)
## merge snapshot
df_merge.loc[2:5, ['plate_state', 'abbr', 'state_full']]
## Create new value N/A
## This means state_plate was empty (blank, '|', or missing)
df_merge.loc[df_merge.plate_state.isin([' ', '|', np.nan]), 'state_full'] = 'N/A'
## Unmatched rows are converted to Unknown
## There were values but unable to decode
df_merge.state_full.fillna(value = 'Unknown', inplace = True)
## Output csv file for easy viewing
df_merge.groupby(['plate_state', 'state_full'])['abbr']\
.size().reset_index().to_csv(os.path.join('.','dataset','state_plate.csv'))
## Summary
print('Percentage per new group:')
for i in ['N/A', 'Unknown']:
print('{}: {:.2f}%'.format(i, (df_merge.state_full == i).sum()/df_merge.shape[0]*100))
print(df_merge.shape)
## plate_state is no longer needed
df_merge.drop(columns = ['plate_state', 'abbr'], inplace = True)
print(df_merge.shape)
del states, states_df
Conclusion: Cannot really say much about cleaning this column because there are only two letters to work with, so it is hard to guess whether there was a typo — it could be in the 1st or the 2nd letter, which opens up too many possible values. In addition, the population whose values could not be decoded is a very small proportion.
df_merge.tick_vin.head(10)
## character length of the officer-entered VIN fragment
vin_len = df_merge.tick_vin.str.len()
vin_len.value_counts(dropna=False, sort = False)
Length 4 is the standard! So try to convert all to length 4 if possible.
## length 2: compare the short entry against the full registered VIN
df_merge.loc[vin_len == 2, ['tick_vin', 'tick_rp_vin']]
For most rows the last two digits match tick_vin, but there are mistakes like row 705869, where tick_vin = 84 but the last 4 VIN digits are actually 8464.
## length 3
df_merge.loc[vin_len == 3, ['tick_vin', 'tick_rp_vin']].head(7)
## values that are length 3 and contain alphabets ('\D' = any non-digit)
vin_tmp = df_merge.loc[vin_len == 3, 'tick_vin']
vin_tmp[vin_tmp.str.contains('\D+')].value_counts()
## Make sure that there are no descriptions in length 4 vins
vin_tmp = df_merge.loc[vin_len == 4, 'tick_vin'].str.contains('\D+')
print('Number of tick_vin length 4 with alphabets: {}'.format(vin_tmp.sum()))
df_merge.loc[vin_len == 4, ['tick_vin', 'tick_rp_vin', 'plate']][vin_tmp].head(10)
## Length 4 and only alphabet (excluding the recurring 'DVWY', inspected below)
vin_tmp = df_merge.loc[vin_len == 4, ['tick_vin', 'tick_rp_vin']]
vin_tmp[vin_tmp.tick_vin.str.isalpha() & (vin_tmp.tick_vin != 'DVWY')].head(10)
Those with alphabets occur because the officer either misread the VIN (like row 30289) or input a random part of the actual VIN.
## Something interesting.
## Only badge 381 input the unique value DVWY
## But it is only a fraction of what this person does
print('Total count of DVWY: {}'\
.format(df_merge.tick_vin.str.contains('DVWY').sum()))
print('Who input this: {}'\
.format(df_merge.loc[df_merge.tick_vin == 'DVWY', 'badge'].unique()))
print('Total tickets by 381: {}'.format((df_merge.badge == 381).sum()))
## Length 5
## Looks fine
df_merge.loc[vin_len == 5, ['tick_vin', 'tick_rp_vin']].head(10)
## Length 5 with alphabet ('\D' = any non-digit character)
vin_tmp = df_merge.loc[vin_len == 5, 'tick_vin'].str.contains('\D+')
df_merge.loc[vin_len == 5, ['tick_vin', 'tick_rp_vin']][vin_tmp].head(10)
## Length 5 with only alphabet
## There is only one
vin_tmp = df_merge.loc[vin_len == 5, ['tick_vin', 'tick_rp_vin']]
vin_tmp.loc[vin_tmp.tick_vin.str.isalpha(), 'tick_vin'].value_counts()
del vin_tmp, vin_len ## wrap up
Conclusion: tick_vin that was input by a human is very unreliable, and there is no way to fix it. The common errors are descriptions in the column like CAN NOT READ or TOO HIGH, recording a random portion of the VIN (sometimes the beginning, which is the same for many cars), recording a random length, typos, misread labels, or just random input to fill in the space.
df_merge.tick_rp_vin.isna().sum()
## vin character length
rp_vin_len = df_merge.tick_rp_vin.str.len()
rp_vin_len.value_counts(dropna = False)
## percent of rows whose registered VIN is not the expected 17 characters
(rp_vin_len != 17).sum()/df_merge.shape[0]*100
del rp_vin_len ## wrap up
Conclusion: I have not dug too deeply into this feature.
df_merge.plate.str.len().value_counts(dropna = False)
## Get the length of license plate input
plate_len = df_merge.plate.str.len()
## Count the unique values with length 1
df_merge.loc[plate_len == 1, 'plate'].value_counts().to_frame().transpose()
## Length 6 is also standard for some states.
## California and a few other have length 7
## Length 5 looks fine. inferred from 50 rows
## Some are custom plates
# df_merge[plate_len == 5].head(50)
df_merge.loc[plate_len == 5, 'plate'].sample(n = 10, random_state = 142)
## Length 4 looks fine
## Custom plate probably
# df_merge[plate_len == 4].sample(n = 50, random_state = 142)
df_merge.loc[plate_len == 4, 'plate'].sample(n = 10, random_state = 142)
## Length 3 looks fine
## Custom plate probably
# df_merge[plate_len == 3].sample(n = 50, random_state = 142)
df_merge.loc[plate_len == 3, 'plate'].sample(n = 10, random_state = 142)
## Length 8
## I don't know if they are special plates
## or fake plates like ILVESHOE or HMMRTYME
# df_merge[plate_len == 8].sample(n = 50, random_state = 142)
df_merge.loc[plate_len == 8, 'plate'].sample(n = 10, random_state = 142)
## Length 2
## These look like random inputs. And I don't think license plate
## Can be two letter long.
# df_merge[plate_len == 2].sample(n = 50, random_state = 142)
df_merge.loc[plate_len == 2, 'plate'].sample(n = 10, random_state = 142)
print('Number of nan before: {}'.format(df_merge.plate.isna().sum()))
## Convert length 1 and 2 plates to NaN (treated as not-available downstream)
df_merge.loc[plate_len <= 2, 'plate'] = np.nan
print('Number of nan after: {}'.format(df_merge.plate.isna().sum()))
del plate_len ## wrap up
Conclusion: It is hard to tell whether a plate is valid because states can have different license plate standards, and sometimes their formats change later. In addition, custom plates are allowed. It is also possible that some plates are fake but cannot easily be identified.
df_merge.veh_body.isna().sum()
## dump full value counts to csv for offline inspection of body-style codes
df_merge.veh_body.value_counts(dropna = False).to_csv('dataset/veh_body.csv')
print(df_merge.veh_body.unique())
Conclusion: I tried decoding the vehicle body style code but did not have luck. There are minor differences between the different tables. In addition, I was not able to find what 'SU' stands for, which is important because it is in the top 3 (count = 378722) in the value_counts table. As a result, I have to leave it as is.
print('Unique Values and Number of NaN for each:')
for i in ['violation', 'violation_code', 'violation_desc_long']:
print('{}: Unique: {} NaN: {}'.format(i,
df_merge[i].unique().shape[0],
df_merge[i].isna().sum()))
## Created violation_group_lookup.xlsx from this function
# df_merge.groupby(['violation_code', 'violation', 'violation_desc_long'])\
# .size().reset_index(name = 'counts').to_csv('dataset/violation_group.csv')
## Afterward, manually bin similar violation together like Prohbited Parking == Parking Restricted
## Or Parking Intersection == Block/Intersection == Park/Veh Crossing
## 3/5/2018
## Shawn redid the lookup table such that it better represent SFMTA's view
## had to resaved shawn's excel file because it had to be repaired
violation_lookup = pd.read_excel(os.path.join('.','dataset','violation_group_lookup-sm_1.xlsx'), 0)
print(violation_lookup.shape)
violation_lookup.head(3)
print('Before: {}'.format(df_merge.shape))
## Merge df_merge with violation_lookup (left join keeps every ticket;
## unmatched codes would get NaN in the new group columns)
df_merge = df_merge.merge(violation_lookup[['violation_code', 'violation_desc_general', 'violation_group']],
on = 'violation_code', how = 'left')
print('After: {}'.format(df_merge.shape))
df_merge[['ticket_number', 'violation_code', 'violation',
'violation_desc_long', 'violation_desc_general', 'violation_group']]\
.sample(n = 10, random_state = 142)
del violation_lookup ## wrap up
Conclusion: I looked at the descriptions of the violations and combined similar violations together.
For instance:
These columns were analyzed after violation, violation_code, and violation_desc_long. Not sure what impact that has.
print('Percent of non-null: {:0.2f}%'\
.format(df_merge.tick_corr_type_desc_long.notna().sum()/df_merge.shape[0]*100))
## Count tickets with correction groupby violation_group, violation_desc_long and tick_corr_type_desc_long
## sort by counts in violation_groups.
## To see which group has the most corrections and if there are obvious trends.
tmp1 = df_merge[df_merge.tick_corr_type.notna()].groupby(['violation_group', 'violation_desc_long', 'tick_corr_type_desc_long'])\
.size().reset_index(name = 'counts')
tmp2 = df_merge[df_merge.tick_corr_type.notna()].groupby('violation_group').size().reset_index(name = 'group_count')
tmp3 = pd.merge(tmp1, tmp2, on = 'violation_group').sort_values(by = ['group_count', 'counts'], ascending = False)
## export for offline review
tmp3.to_csv('dataset/tick_corr.csv')
tmp3.head()
del tmp1, tmp2, tmp3 ## wrap up
Conclusion: tick_corr is more administrative work, such as determining whether there are other complications to the citation — e.g., whether the car was stolen, whether the ticket is still owed, or whether the car drove away before the officer was able to place the notice. In some cases it doesn't have anything to do with the violation, therefore reducing the available information (BLOCK/INTERSECTION --> DRIVE AWAY COPY), and in some cases it is just redundant information (PARKING ON GRADES == 58A CURBED WHLS). Not only that, most descriptions are hard to understand because they are abbreviated (GEN VALID CITE, 802REFNOTF, SRVY CNDCT-VAL).
I will leave this feature as is.
This part was not updated after adding the method to filter out points outside of San Francisco.
with pd.option_context('display.float_format', '{:0.6f}'.format):
print(df_merge[['hh_gpslat', 'hh_gpslon']].describe())
## Determine if lat and long are missing at the same time
print('GPS are missing together: {}'\
.format((df_merge.hh_gpslat.isna() == df_merge.hh_gpslon.isna()).all()))
print('GPS are value 0 together: {}'\
.format(((df_merge.hh_gpslat == 0) == (df_merge.hh_gpslon == 0)).all()))
print('Nan total: {:,}'.format(df_merge.hh_gpslat.isna().sum()))
# print('Nan before: {}'.format(df_merge.hh_gpslat.isna().sum()))
# ## Convert all gps with value 0 to nan
# df_merge.loc[df_merge.hh_gpslat == 0, 'hh_gpslat'] = np.nan
# df_merge.loc[df_merge.hh_gpslon == 0, 'hh_gpslon'] = np.nan
# print('Nan after: {}'.format(df_merge.hh_gpslat.isna().sum()))
## Keep record for which row was missing gps coordinates
df_merge['gps_missing_record'] = df_merge.hh_gpslat.isna()
## gps_missing is used later
## NOTE(review): presumably gps_missing gets updated as coordinates are
## imputed, while gps_missing_record preserves the original missingness --
## confirm against the later imputation cells
df_merge['gps_missing'] = df_merge.hh_gpslat.isna()
## indicator for which rows were imputed
df_merge['imputed'] = False
## Percent missing
print('Percent gps coord missing: {:.2f}%'\
.format(df_merge.hh_gpslat.isna().sum()/df_merge.shape[0]*100))
## Percent missing for handheld issued tickets
print('Percent gps coord missing for handheld: {:0.2f}%'\
.format(df_merge[df_merge.handheld].hh_gpslat.isna().sum()/df_merge[df_merge.handheld].shape[0]*100))
There are some random strays in the ocean, and one in Salt Lake City. However, they are few, so I am not too worried about them. (UPDATE: after using a bounding box and geopandas to determine whether points are inside the boundaries, these outliers are removed, as can be seen above.)
## Export df_merge for tableau visualization
## Only columns needed
# df_merge.loc[~df_merge.gps_missing,
# ['ticket_number', 'issue_datetime',
# 'hh_gpslat', 'hh_gpslon', 'tick_street_name']]\
# .to_csv('dataset/tableau/df_merge_min.csv')
## Manually selected points that were approximately one block outside of Geary
Image(filename = os.path.join('.','image','geary_all_year_oneblock_filter.png'),
width = 500, height = 100)
## hand-picked outlier rows exported from Tableau (row indices in column F1)
geary_out = pd.read_csv(os.path.join('.','dataset','tableau', 'geary_outside_oneblock.csv'))
print('Number of rows outside of Geary: {}'.format(geary_out.shape[0]))
## Out of all tickets with 'Geary' street_name what is the percent bad
print('Percent of ticket population: {:0.3f}%'\
.format(geary_out.shape[0]/
df_merge.tick_street_name.str.contains('GEARY',regex=False).sum()*100))
## Obtain the tickets using index from geary_out
df_merge.iloc[geary_out.F1][['ticket_number','tick_street_no',
'tick_street_name', 'tick_street_suffix',
'hh_gpslat', 'hh_gpslon']]\
.sample(n = 5, random_state = 33)
Correct gps coordinates:
## Manually selected points that were approximately one block outside of Mission
Image(filename = os.path.join('.','image','mission_all_year_oneblock.png'),
      width = 500, height = 100)
mission_out = pd.read_csv(os.path.join('.','dataset','tableau', 'mission_outside_oneblock.csv'))
print('Number of rows outside of Mission: {}'.format(mission_out.shape[0]))
## Out of all tickets with 'Mission' street_name what is the percent bad
print('Percent of ticket population: {:0.2f}%'\
      .format(mission_out.shape[0]/
              df_merge.tick_street_name.str.contains('MISSION',regex=False).sum()*100))
## Obtain the tickets using index from mission_out
## NOTE(review): F1 appears to be the exported df_merge row positions.
df_merge.iloc[mission_out.F1][['ticket_number','tick_street_no',
                               'tick_street_name', 'tick_street_suffix',
                               'hh_gpslat', 'hh_gpslon']]\
        .sample(n = 5, random_state = 33)
Correct GPS:
## Only look for addresses from handheld tickets because
## handwritten tickets have no gps, so including them would bias the counts
tmp = df_merge[df_merge.handheld]
## Build the full address string: number + name + suffix
full_street = tmp.tick_street_no.astype('str')\
                 .str.cat([tmp.tick_street_name,
                           tmp.tick_street_suffix], sep = ' ')
## Remove the '.0' decimal left over from the float -> str conversion.
## FIX: raw string + explicit regex=True. The old '\.0' pattern relied
## on pandas' former regex-by-default behavior; pandas >= 2.0 defaults
## str.replace to regex=False, under which the pattern would never match.
full_street = full_street.str.replace(r'\.0', '', regex = True)
full_street.head()
## Convert to dataframe
tmp = pd.DataFrame(full_street)
## Change name of column
tmp.columns = ['full_street']
## Add gps to determine which row is missing
tmp['gps'] = df_merge[df_merge.handheld].hh_gpslat
## Aggregate based on full_street and return total rows and
## number of nonmissing rows
## ('count' skips NaN -> rows that do have gps; 'size' counts every row)
tmp = tmp.groupby('full_street')['gps'].agg(['count', 'size'])\
         .sort_values(by='count', ascending = False)
tmp.columns = ['hasgps', 'total'] ## rename columns
## Calculate percent of not missing gps data (integer floor percent)
tmp['percent_complete'] = tmp['hasgps']*100//tmp['total']
tmp.to_csv('dataset/full_street_counts.csv')
tmp.head(6)
tmp.percent_complete.plot.hist()
plt.title('Distribution of Complete GPS Data By Address')
plt.xlabel('Completeness (%)')
plt.show()
## If you didn't remove non-handheld tickets, you will
## see something entirely different for 700 4TH ST
## the completeness is 0%
tmp.loc[['700 04TH ST', '700 4TH ST']]
## gps from google maps search
## Looking at some of the top address
## There are two 4th st. Just seeing if that zero affects
## the geoencoding
full_street_gps = pd.DataFrame.from_dict({'full_street': ['400 01ST ST', '300 01ST ST',
                                                          '700 04TH ST',
                                                          '501 TERRY A FRANCOIS BLVD',
                                                          '331 TOWNSEND ST'],
                                          'hh_gpslat': [37.786329, 37.787343,
                                                        37.776789,
                                                        37.775569, 37.776578],
                                          'hh_gpslon': [-122.393263, -122.394522,
                                                        -122.394644,
                                                        -122.387663, -122.395667],
                                          'gps_from': 'google'})
## Repeat df by 3 times to include years
full_street_gps = pd.concat([full_street_gps]*3, ignore_index=True)
## Add year from 2015 - 2017 (each year labels one copy of the 5 addresses)
## FIX: integer division -- Series.repeat requires an int; the old
## shape[0]/3 produced a float, which modern pandas/numpy rejects.
full_street_gps['year'] = pd.Series([2015,2016,2017]).repeat(full_street_gps.shape[0]//3).values
print(full_street_gps.shape)
full_street_gps.head()
## Only data that belongs to handheld
## And extract only columns of interest
tmp_handheld = df_merge.loc[df_merge.handheld, ['ticket_number', 'issue_datetime',
                                                'tick_street_name','hh_gpslat',
                                                'hh_gpslon']].copy()
tmp_handheld['full_street'] = full_street ## Add full_street address
## get a subset. export to tableau. make sure to have year as well.
## Extract tickets with address of interest
tmp_handheld = tmp_handheld[tmp_handheld.full_street.isin(full_street_gps.full_street)]
tmp_handheld.dropna(inplace = True) ## drop missing gps
## extract year from datetime
tmp_handheld['year'] = tmp_handheld.issue_datetime.dt.year
tmp_handheld.drop(columns=['issue_datetime'], inplace = True)
## Tag the source so ticket points can be distinguished from the
## google reference points after they are combined for export
tmp_handheld['gps_from'] = 'ticket'
print(tmp_handheld.shape)
tmp_handheld.head(2)
tmp_handheld.groupby(['year', 'full_street']).size().unstack(level = 0).plot.bar()
plt.ylabel('Number of tickets')
plt.title('Tickets by Year and Address with GPS')
plt.show()
## Export df for tableau: ticket GPS points plus the google reference
## points (distinguished by the 'gps_from' column)
## FIX: pd.concat replaces DataFrame.append, which was deprecated in
## pandas 1.4 and removed in 2.0; the output is identical.
pd.concat([tmp_handheld, full_street_gps], ignore_index=True)\
  .to_csv('./dataset/tableau/tmp_handheld.csv', index = False)
## Screenshots of the Tableau scatter for each reference address
Image(filename = os.path.join('.','image','1_400_01ST.png'),
      width = 400, height = 80)
Image(filename = os.path.join('.','image','2_300_01ST.png'),
      width = 400, height = 80)
Image(filename = os.path.join('.','image','3_700_4TH.png'),
      width = 400, height = 80)
Image(filename = os.path.join('.','image','4_510_Terry.png'),
      width = 400, height = 80)
Image(filename = os.path.join('.','image','5_331_Townsend.png'),
      width = 400, height = 80)
Certain locations have very high variance while others, like 4TH and Terry, do not. It is unlikely that the GPS coordinates are based on user input, because the inputs are exactly the same. This is based on uncleaned data, too.
## Free the intermediates from the GPS-variance investigation;
## none of them are used again below.
del geary_out, mission_out, full_street, full_street_gps, tmp_handheld
Conclusion: There is not much I can do from here, other than try to find the GPS points that are outside of San Francisco, which are wrong. But that seems like a small population; therefore, I am not going to worry about it. Imputation will be done with the street address.
## Report NaN counts for the street columns, blank out the string ones,
## then report again to confirm the fill took effect.
street_cols = ['tick_street_no', 'tick_street_name', 'tick_street_suffix']
print('Number of nan:')
for col in street_cols:
    print('{} : {}'.format(col, df_merge[col].isna().sum()))
## For the string columns, replace NaN with the empty string because
## NaN propagates through concatenation (nan + string = nan), which is bad
for col in ('tick_street_name', 'tick_street_suffix'):
    df_merge[col].fillna(value = '', inplace = True)
print('Number of After nan:')
for col in street_cols:
    print('{} : {}'.format(col, df_merge[col].isna().sum()))
## Fixing misspelling in the street suffix column
## (Series.replace with a scalar does exact whole-value matching)
df_merge.tick_street_suffix.replace(to_replace = 'BLD', value = 'BLVD', inplace = True)
df_merge.tick_street_suffix.replace(to_replace = 'RD', value = 'ROAD', inplace = True)
## Make things easier to search
df_merge.tick_street_suffix.replace(to_replace = 'SQ', value = 'SQUARE', inplace = True)
## Only 3 rows and is supposed to be DRIVE
df_merge.tick_street_suffix.replace(to_replace = 'WALK', value = 'DRIVE', inplace = True)
## Has 12 rows but their street names are majority missing
df_merge.tick_street_suffix.replace(to_replace = 'BLOCK', value = '', inplace = True)
## Summary of suffix
df_merge.tick_street_suffix.value_counts(dropna = False)
## Total number of unique streets
print('Number of unique street names: {}'.format(df_merge.tick_street_name.unique().shape[0]))
## https://data.sfgov.org/Geographic-Locations-and-Boundaries/Street-Names/6d9h-4u5v
## Pulled on January 22, 2018, official san francisco street names
off_street_names = pd.read_csv(os.path.join('.', 'dataset', 'Street_Names.csv'))
## Remove unknown streets
off_street_names = off_street_names[~off_street_names.StreetName.str.contains('UNNAMED')]
## Remove streets with TI ending like 6TH AVE TI
## NOTE(review): ' TI' is a substring match, not an end-of-string match;
## it would also drop any name containing ' TI' mid-string -- confirm
## none exist in the official list.
off_street_names = off_street_names[~off_street_names.StreetName.str.contains(' TI')]
## Check percentage of inconsistent input
off_street_names.head(2)
## Match tick_street_name with official san francisco street names
## count the matches and divide by total tickets
print('Percent of Correct Street Name Input: {:0.2f}%'\
      .format(df_merge.tick_street_name.isin(off_street_names.StreetName).sum()/
              df_merge.shape[0] * 100))
## Remove non alphanumerics and then end spaces.
## FIX: raw string + explicit regex=True. pandas >= 2.0 defaults
## str.replace to regex=False, under which the character class would be
## treated as a literal and nothing would be stripped.
df_merge.tick_street_name = df_merge.tick_street_name.str.replace(r'[^a-zA-Z0-9\s]', '', regex = True).str.strip()
print('After: {}'.format(df_merge.tick_street_name.unique().shape[0]))
## Replace misspelling
def str_rep(to_replace, replace, regex = False):
    """Replace values in df_merge.tick_street_name in place.

    to_replace : str or list of str -- value(s) or pattern(s) to replace.
    replace    : str -- the replacement value.
    regex      : bool -- when True, to_replace is treated as regex
                 pattern(s) (substring match); when False, Series.replace
                 does exact whole-cell matching.
    """
    df_merge.tick_street_name.replace(to_replace = to_replace,
                                      value = replace, inplace = True,
                                      regex = regex)
## Long chain of manual street-name normalizations. Order matters:
## later passes assume the spellings produced by earlier ones.
## BLK, BLK OF (A lot of streetnames has this in front)
str_rep(['BLK OF ', 'BLOCK OF '], '', regex = True)
str_rep(['BLK ', 'BLOCK ', 'BK '], '', regex = True)
## LAPU LAPU was LAPU-LAPU before - was taken out
str_rep('LAPULAPU', 'LAPU LAPU', regex = True)
## OFARRELL
str_rep(['O FARRELL', 'OFARRELL', "O'FARRELL",
         'O"FARRELL', "C O'FARRELL", 'FARRELL',
         'OFARREL', 'OFFARRELL'], 'OFARRELL')
## EMBARCADERO
str_rep(['EMBARCADERO NORTH', 'EMBARCADERO SOUTH', 'EMBARCADERO'], 'THE EMBARCADERO')
## PIER 43
## NOTE(review): 'PEIR 43' looks like a typo, but the PIER 43 pass
## further down maps 'PEIR 43' -> 'PIER 43', so the net result is correct.
str_rep('PIER431', 'PEIR 43')
## BAYSHORE
str_rep(['BAY SHORE'], 'BAYSHORE')
## 01ST
str_rep(['1ST'], '01ST')
## 02ND
str_rep(['2ND'], '02ND')
## 03RD
str_rep(['3RD', '3 RD'], '03RD')
## 04TH
str_rep(['4TH'], '04TH')
## 05TH
str_rep(['5TH'], '05TH')
## 06th
str_rep(['6TH'], '06TH')
## 07Th
str_rep(['7TH'], '07TH')
## 8TH
str_rep(['8TH'], '08TH')
## 09th
str_rep(['9TH'], '09TH')
## 16th
str_rep(['16'], '16TH')
## 25TH
str_rep(['25TH NORTH'], '25TH')
## JOHN F KENNEDY
str_rep(['JFK'], 'JOHN F KENNEDY', regex = True)
## MARTIN LUTHER KING JR
str_rep(['MLK JR', 'MLK DR', 'MLKD', 'MLKF'], 'MARTIN LUTHER KING JR', regex = True)
str_rep('MLK', 'MARTIN LUTHER KING JR', regex = True)
str_rep(['MARTIN LUTHER KING'], 'MARTIN LUTHER KING JR')
## DR CARLTON B GOODLETT
str_rep(['CARLTON B GOODLETT', 'CARLTON B GOODLET',
         'CARLTON B. GOODLETT', 'DR CARLTON B. GOODLETT',
         'CARLTONN B GOODLETT'], 'DR CARLTON B GOODLETT')
## BUENA VISTA AVE EAST
## NOTE(review): this replaces the value with itself (a no-op); the
## actual rename to 'BUENA VISTA EAST' happens further below.
str_rep('BUENA VISTA AVE EAST', 'BUENA VISTA AVE EAST')
## GREAT HIGHWAY
str_rep(['GREAT HIGHWAY', 'GRT HWY'], 'GREAT HWY', regex = True)
str_rep('GRT', 'GREAT HWY')
str_rep(['LOWER GREAT HIGHWAY', 'LOWER GREAT HWY', 'GRT HYW'], 'GREAT HWY')
str_rep(['GRT HWY 1000', 'GREAT HWY 1000'], '1000 GREAT HWY')
## JUNIPERO SERRA BLVD
str_rep(['JUNIPERO SERRA BLVD OFF', 'JUNIPERO SERRA BLVD OFF R',
         'JUNIPERO SERRA BLVD ON RA', 'JUNIPERO',
         'JUNIPERO SERRA BLVD ON R'], 'JUNIPERO SERRA BLVD')
## CECILIA
str_rep('CECELIA', 'CECILIA')
## NORTHPOINT
str_rep('NORTHPOINT', 'NORTH POINT')
## SERGEANT JOHN V YOUNG
str_rep('SERGEANT JOHN V YOUNG', 'SGT JOHN V YOUNG')
## SUNVIEW DRIVE
str_rep('SUNVIEW WAY', 'SUNVIEW DRIVE')
## CALIFORNIA
str_rep('CALIFORINA', 'CALIFORNIA')
## AVENUE OF THE PALMS
str_rep('AVE OF THE PALMS', 'AVENUE OF THE PALMS')
## BUCHANAN
str_rep('BUCHANNAN', 'BUCHANAN')
## PALOV
str_rep('PALOV', 'PALOU')
## POTRERO
str_rep('POTERO', 'POTRERO', regex=True)
## LAGUNA HONDA BLVD
str_rep('LAGUNA HONDA BVLD', 'LAGUNA HONDA BLVD')
## CHRISTMAS TREE
str_rep(['CHRISTMAS TREE', 'CHRISTMAS TREE PT'], 'CHRISTMAS TREE POINT')
## WEST VIEW
str_rep('VIEW', 'WEST VIEW')
## FERRY BLDG EMBARCADERO NORTH
str_rep('FERRY BLG EMBARCADERO NORTH', 'FERRY BLDG EMBARCADERO NORTH')
## FERRY BLDG EMBARCADERO SOUTH
str_rep('FERRY BLG EMBARCADERO SOUTH', 'FERRY BLDG EMBARCADERO SOUTH')
## LOT BUS STOP
str_rep(['LOT BUS', 'LOT BUS ONLY',
         'LOT BUS RED', 'LOT RED AND BUS ZONE'], 'LOT BUS STOP')
## LINCOLN WAY
str_rep('LINCOLN WY', 'LINCOLN WAY')
## GEARY BLVD
str_rep('GEARY BVLD', 'GEARY BLVD')
str_rep('GERAY', 'GEARY')
## MIDDLEPOINT
str_rep('MIDDLEPOINT', 'MIDDLE POINT', regex = True)
## STOW LAKE
str_rep('STOWLAKE', 'STOW LAKE')
## EL DORADO NORTH
str_rep('EL DORADO NORTH', 'EL DORADO')
## MOSCOW ST
str_rep('MOSCOW CROCKER AMAZON', 'MOSCOW ST')
## MIDDLE WEST DRIVE
str_rep(['MIDDLE DRIVE WEST', 'MIDDLE DR WEST'], 'MIDDLE WEST DRIVE')
## BLYTHDALE
str_rep('BLYTHEDALE', 'BLYTHDALE')
## SAN BRUNO
str_rep('SAN BURNO', 'SAN BRUNO')
## MARY TERESA
str_rep('MARY THERESA', 'MARY TERESA')
## BUENA VISTA EAST AVE
str_rep('BUENA VISTA AVE EAST', 'BUENA VISTA EAST')
## BUENA VISTA WEST
str_rep('BUENA VISTA AVE WEST', 'BUENA VISTA WEST')
## CONSERVATORY DR EAST
str_rep('CONSERVATORY DR EAST', 'CONSERVATORY EAST DR')
str_rep('CONSERVATORY DR WEST', 'CONSERVATORY WEST DR')
## Mission Bay Blvd North
## NOTE(review): 'MISSION BAY BLVD NORTH' is listed twice -- harmless.
str_rep(['MISSION BAY BLVD (NORTH)', 'MISSION BAY BLVD NORTH',
         'MISSION BAY BLVD NORTH'], 'MISSION BAY NORTH BLVD')
## Mission Bay Blvd South
str_rep(['MISSION BAY BLVD (SOUTH)', 'MISSION BAY BLVD SOUTH'],
        'MISSION BAY SOUTH BLVD')
## FERRY BUILDING EMBARCADERO SOUTH
str_rep(['FFERRY BLG EMBARCADERO SOUTH', 'FERRY EMBARCADERO SOUTH',
         'FERRY BLD EMBARCADERO SOUTH'], 'FERRY BLDG EMBARCADERO SOUTH')
## FERRY BLDG EMBARCADERO NORTH
str_rep(['FERRY BUILDING', ' FERRY BLD EMBARCADERO NORTH',
         'FFERRY BLG EMBARCADERO NORTH', 'FFERRY BLD EMBARCADERO NORTH',
         'FERRY BUILDING AND EMBARCADERO'], 'FERRY BLDG EMBARCADERO NORTH')
## YATCH ROAD
str_rep('YACH', 'YACHT')
str_rep(['YACHT RD SAN FRANCISCO', 'YATCH RD MARINA',
         'YACHT RD MC', 'YACHT RD AND MARINA',
         'YACHT RD MARINA GREEN PLOT WES'], 'YACHT ROAD')
## MCALLISTER
str_rep('MC ALLISTER', 'MCALLISTER')
## UNITED NATIONS
str_rep(['UNITED NATION', 'UN'], 'UNITED NATIONS')
## Fixing Pier specifically
## Stop when count was at 3. So 4 <= corrected
## This leaves ~ 132 rows uncorrected and ~99 unique street names
## BUG FIX: the original list was missing a comma after
## 'PIER1 1-2 EMBARCADERO NORTH', so Python concatenated it with the
## next literal into one bogus string and neither variant was replaced.
str_rep(['PIER 1 EMBARCADERO NORTH', 'PIER 01 EMBARCADERO NORTH',
         'PIER1 EMBARCADERO NORTH', 'PIER1 1-2 EMBARCADERO NORTH',
         'PIER 1 1-2 EMBARCADERO NORTH', 'PIER EMBARCADERO NORTH',
         'PIER 1-1', 'PIER 1 EMBARCADERO SOUTH', 'PIER 1'], 'PIER 01')
str_rep(['PIER 02 EMBARCADERO SOUTH', 'PIER 02 EMBARCADERO NORTH'], 'PIER 02')
str_rep(['PIER 03 EMBARCADERO NORTH', 'PIER 03 EMBARCADERO SOUTH',
         'PIER3 EMBARCADERO NORTH', 'PIER 3 EMBARCADERO NORTH',
         'PIER03 EMBARCADERO NORTH', 'PIER 3'], 'PIER 03')
## PIER 05
str_rep(['PIER 05 EMBARCADERO NORTH', 'PIER 5 EMBARCADERO NORTH',
         'PIER5 EMBARCADERO NORTH', 'PIER 0 5'], 'PIER 05')
str_rep(['PIER 07 EMBARCADERO NORTH'], 'PIER 07')
str_rep(['PIER 09 EMBARCADERO NORTH'], 'PIER 09')
str_rep(['PIER 15 EMBARCADERO NORTH'], 'PIER 15')
str_rep(['PIER 17 EMBARCADERO NORTH'], 'PIER 17')
str_rep(['PIER 23 EMBARCADERO NORTH'], 'PIER 23')
str_rep(['PIER 24 EMBARCADERO SOUTH'], 'PIER 24')
str_rep(['PIER 26 EMBARCADERO SOUTH'], 'PIER 26')
str_rep(['PIER 27 EMBARCADERO NORTH'], 'PIER 27')
## (duplicate 'PIER 28 EMBARCADERO NORTH' entry removed -- it was a no-op)
str_rep(['PIER 28 EMBARCADERO SOUTH', 'PIER 28 EMBARCADERO NORTH'], 'PIER 28')
str_rep(['PIER 30 EMBARCADERO SOUTH', 'PIER30 EMBARCADERO SOUTH'], 'PIER 30')
str_rep(['PIER 31 EMBARCADERO NORTH'], 'PIER 31')
str_rep(['PIER 33 EMBARCADERO NORTH', 'PIER 33 EMBARCADERO',
         'PIER 33 EMBARCADERO SOUTH', 'PIER 33 AND EMBARCADERO',
         'PIER 33 AND HALF', 'PIER33 EMBARCADERO NORTH'], 'PIER 33')
str_rep(['PIER 35 EMBARCADERO NORTH'], 'PIER 35')
str_rep(['PIER 39 THE EMBARCADERO', 'PIER39 EMBARCADERO NORTH'], 'PIER 39')
str_rep(['PIER41 EMBARCADERO NORTH', 'PIER41 THE EMBARCADERO',
         'PIER 41 THE EMBARCADERO', 'PIER 41 EMBARCADERO NORTH'], 'PIER 41')
str_rep(['PIER43 EMBARCADERO NORTH', 'PIER 43 EMBARCADERO NORTH',
         'PIER 43 THE EMBARCADERO', 'PIER43 THE EMBARCADERO',
         'PIER43 1-2 EMBARCADERO NORTH', 'PIER 431', 'PEIR 43'], 'PIER 43')
str_rep(['PIER 45 THE EMBARCADERO', 'PIER45 THE EMBARCADERO',
         'PIER 45 EMBARCADERO NORTH'], 'PIER 45')
str_rep(['PIER 52LOT TERRY A FRANCOIS'], 'PIER 52')
## Manually Input GPS
## Collapse all the Yacht Rd / Marina Green spelling variants into one
## temporary label so the coordinates below can be assigned in one shot
str_rep(['YACHT RD MARINA GREEN P', 'YACHT RD M',
         'YACHT RD MG', 'YACHT RD MARINA GREEN PARKING',
         'YACHT HARBOR', 'YACHT RD MARINA GREEN',
         'YACHT RD AND MARINA GREEN P', 'YACHT RD MARINA GREEN PLOT EAS',
         'YACHT RD AND MARINA GRN', 'YACHT RD MARINA',
         'YACHT RD MARINA GREEN PLOT'], 'YACHT RD MARINA')
## for 110 YACHT ROAD
df_merge.loc[df_merge.tick_street_name == 'YACHT RD MARINA', 'hh_gpslat'] = 37.806935
df_merge.loc[df_merge.tick_street_name == 'YACHT RD MARINA', 'hh_gpslon'] = -122.447542
df_merge.loc[df_merge.tick_street_name == 'YACHT RD MARINA', 'imputed'] = True
## Match with the other addresses
str_rep('YACHT RD MARINA', 'YACHT ROAD')
## Manual GPS imputation for intersection-style names whose gps is
## (#count) ALL nan
### (#66) CATALINA AND FAIRFAX = 37.736284, -122.380847
## FAIRFAX has 320 while CATALINA 76 tickets, but FAIRFAX is 6x longer
## address is 209 Fairfax and 221 Catalina
df_merge.loc[df_merge.tick_street_name == 'CATALINA AND FAIRFAX',
             'hh_gpslat'] = 37.736284
df_merge.loc[df_merge.tick_street_name == 'CATALINA AND FAIRFAX',
             'hh_gpslon'] = -122.380847
df_merge.loc[df_merge.tick_street_name == 'CATALINA AND FAIRFAX',
             'imputed'] = True
## NOTE: rename must come last -- the earlier .loc writes key off the
## old name, so changing tick_street_name first would orphan them
df_merge.loc[df_merge.tick_street_name == 'CATALINA AND FAIRFAX',
             'tick_street_name'] = '221 CATALINA'
## (#69) JOHN F KENNEDY AND GREAT HWY = 37.769196, -122.510987
## it is 1000 GREAT HWY
df_merge.loc[df_merge.tick_street_name == 'JOHN F KENNEDY AND GREAT HWY',
             'hh_gpslat'] = 37.769196
df_merge.loc[df_merge.tick_street_name == 'JOHN F KENNEDY AND GREAT HWY',
             'hh_gpslon'] = -122.510987
df_merge.loc[df_merge.tick_street_name == 'JOHN F KENNEDY AND GREAT HWY',
             'imputed'] = True
df_merge.loc[df_merge.tick_street_name == 'JOHN F KENNEDY AND GREAT HWY',
             'tick_street_name'] = '1000 GREAT HWY'
## (#35) MARTIN LUTHER KING JR AND LINCOLN = 37.764121, -122.508909
df_merge.loc[df_merge.tick_street_name == 'MARTIN LUTHER KING JR AND LINCOLN',
             'hh_gpslat'] = 37.764121
df_merge.loc[df_merge.tick_street_name == 'MARTIN LUTHER KING JR AND LINCOLN',
             'hh_gpslon'] = -122.508909
df_merge.loc[df_merge.tick_street_name == 'MARTIN LUTHER KING JR AND LINCOLN',
             'imputed'] = True
df_merge.loc[df_merge.tick_street_name == 'MARTIN LUTHER KING JR AND LINCOLN',
             'tick_street_name'] = '4750 LINCOLN'
## refresh gps_missing (gps_missing_record keeps the pre-imputation state)
df_merge['gps_missing'] = df_merge.hh_gpslat.isna()
## Adding Suffix to be consistent
## There is only one option for these, which is why this is safe
df_merge.loc[df_merge.tick_street_name == 'CROCKER AMAZON', 'tick_street_suffix'] = 'PARK'
## SAHFTER
df_merge.loc[df_merge.tick_street_name == 'SAHFTER', 'tick_street_suffix'] = 'AVE'
## THORTON
df_merge.loc[df_merge.tick_street_name == 'THORTON', 'tick_street_suffix'] = 'AVE'
## LOT BUS STOP
df_merge.loc[df_merge.tick_street_name == 'LOT BUS STOP', 'tick_street_suffix'] = ''
## CATALINA
df_merge.loc[df_merge.tick_street_name == 'CATALINA', 'tick_street_suffix'] = 'ST'
## COLUMBIA
df_merge.loc[df_merge.tick_street_name == 'COLUMBIA', 'tick_street_suffix'] = 'SQUARE'
Look for street names that don't match the official or approved street names.
## Bin unique names together
street_counts = df_merge.tick_street_name.value_counts().reset_index(name = 'counts')
street_counts.columns = ['street', 'counts'] ## rename columns
print('Number of unique streetnames after: {}'.format(street_counts.shape[0]))
## Determine which streets does not exist
## Gather all the names because the naming in the dataset is inconsistent
## FIX: pd.concat replaces Series.append, which was deprecated in
## pandas 1.4 and removed in 2.0; output is identical.
streetname = off_street_names.StreetName
streetname = pd.concat([streetname, off_street_names.FullStreetName], ignore_index = True)
## Our dataset name in this format: Street Direction Suffix
tmp = off_street_names.StreetName + ' ' + off_street_names.PostDirection
## nan + String == nan so got to remove nan
streetname = pd.concat([streetname, tmp[tmp.notna()]], ignore_index = True)
## manual input of approved streetname values
tmp = ['S', 'BAYSHORE', 'PARKLOT 2450 CALIFORNIA', 'PARKLOT 3252 PIERCE', 'PARKLOT 421 18TH',
       'PARKLOT 5732 GEARY', 'PARKLOT 324 8TH', 'PARKLOT 1275 20TH', 'PARKLOT 1325 8TH',
       'PARKLOT 807 ULLOA', 'PARKLOT 4116 18TH', 'YACHT ROAD', 'PARKLOT 457 CASTRO',
       'PARKLOT 174 WEST PORTAL', 'PARKLOT 330 9TH', 'PARKLOT 20 NORTON', 'PARKLOT 985 POTRERO',
       'PARKLOT 4061 24TH', 'PARKLOT 3255 24TH', 'LECH WALESA', 'PARKLOT 25 FELTON',
       'PARKLOT 1340 7TH', 'FERRY BLDG EMBARCADERO SOUTH', 'FERRY BLDG EMBARCADERO NORTH',
       'SUNVIEW DRIVE', 'FERRY BLG EMBARCADERO NORTH', 'VIEW', 'CROCKER AMAZON', 'SAHFTER',
       'SGT JOHN V YOUNG', 'LOT BUS STOP', 'PALACE OF FINE ARTS', 'MARY', 'LOT RED ZONE',
       'PARKLOT 2500 OCEAN', 'THORTON', 'AVE B', 'ST FRANCIS AVE', 'ST MARYS AVE',
       'ST FRANCIS BLVD', 'ST CHARLES AVE', 'ST ELMO WAY', 'MISSION BAY NORTH BLVD',
       'MISSION BAY SOUTH BLVD', 'COLUMBIA', 'LAPU LAPU', 'LAPU-LAPU', '1000 GREAT HWY',
       '221 CATALINA', 'PARKLOT 3000 OCEAN', 'UNITED NATIONS']
## Add in the PIERs (zero-padded to match the normalized 'PIER nn' form)
tmp = tmp + ['PIER ' + str(i).rjust(2,'0') for i in range(1,53)]
## Then concatenate all values into streetname
streetname = pd.concat([streetname, pd.Series(tmp)], ignore_index = True)
print('Total number of recognized street_name: {}'.format(streetname.shape[0]))
## Which street matched with official street naming
match_street = street_counts.street.isin(streetname)
## Manually Look at mismatch streets
print('Total rows not recognized: {}'.format(street_counts[~match_street].counts.sum()))
street_counts[~match_street].to_csv('dataset/unmatched_street.csv')
## Street names that were recognized
street_counts[match_street].to_csv('dataset/match_street.csv')
print('Percent rows recognized: {:0.2f}%'.format(street_counts[match_street].counts.sum()/
                                                 df_merge.shape[0] * 100))
Using tableau to fix 3 streets
## Trying to understand streetname with structure: ###ST
## (names starting with digits followed by 'ST', e.g. mis-keyed ordinals)
tmp = street_counts[~match_street]
nnnst = tmp[tmp.street.str.contains('^[0-9]+ST')]
print(nnnst.shape)
print('Total number of rows before: {}'.format(nnnst.counts.sum()))
## Only care about streetnames > 5
nnnst = nnnst.query('counts > 5').copy()
print(nnnst.shape)
print('Total number of rows after: {}'.format(nnnst.counts.sum()))
## Obtain rows with those streetnames and gps is not missing
df_nnnst = df_merge[(~df_merge.gps_missing) & df_merge.tick_street_name.isin(nnnst.street)]
df_nnnst.shape
df_nnnst.to_csv('dataset/tableau/street_nnnST.csv') ## export for tableau visualization
There wasn't a solid pattern by which I could automatically fix them, such as using a street suffix or a street_name range.
## Manually grouped clusters together. They were obvious because they are neatly on the street
Image(filename = os.path.join('.','image','cleaning_nnnst.png'), width = 500, height = 100)
## Import manually grouped streets which was exported from tableau
nnnst_tableau = pd.read_csv(os.path.join('.','dataset','tableau', 'street_nnnST_tableau_export.csv'))
print('Before: {}'.format(nnnst_tableau.shape[0]))
## throw away rows that are other (unclassified)
nnnst_tableau = nnnst_tableau[nnnst_tableau['Ticket Number (group)'] != 'Other']
print('Number of rows to be fixed: {}'.format(nnnst_tableau.shape[0]))
## Replace tick_street_name using index of the tickets
## NOTE(review): F1 appears to hold df_merge row positions from the
## tableau export -- iloc below depends on that.
nnnst_idx = nnnst_tableau.F1 ## index of ticket to fix
## index for column name
street_idx = df_merge.columns.get_loc('tick_street_name')
cor_street = nnnst_tableau['Ticket Number (group)'].unique()
## Verify things are working correctly
print(cor_street)
print('Before replacement: {}'\
      .format(df_merge.iloc[nnnst_idx, street_idx].isin(cor_street).sum()))
## Replace ticket street name
df_merge.iloc[nnnst_idx, street_idx] = nnnst_tableau['Ticket Number (group)'].values
## Remove any suffix if there are any
df_merge.iloc[nnnst_idx, df_merge.columns.get_loc('tick_street_suffix')] = ''
## Verify replacement count equals the number of to-be-fixed rows
print('After replacement: {}'\
      .format(df_merge.iloc[nnnst_idx, street_idx].isin(cor_street).sum()))
## wrap up
del nnnst, df_nnnst, nnnst_tableau, nnnst_idx, street_idx, cor_street
del streetname, match_street, street_counts
Random notes:
## Strip white space from street_name and street_suffix
df_merge.tick_street_name = df_merge.tick_street_name.str.strip()
df_merge.tick_street_suffix = df_merge.tick_street_suffix.str.strip()
## Summary stats for the street number (cast to int for readability)
df_merge.tick_street_no.describe(percentiles=[.75, .95, .99]).astype('int')
df_merge.tick_street_no.plot.box()
plt.ylabel('Street Number')
plt.title('The range of Street Number')
plt.show()
Longest Street/Highest Street Number in San Francisco
## Look for outliers (street numbers beyond the plausible SF maximum)
street_outlier = df_merge.tick_street_no > 8500
print('Number of outlier: {:,}'.format(street_outlier.sum()))
df_merge.tick_street_no[street_outlier].plot.hist()
plt.title('Closer look at outliers greater than 8500')
plt.show()
## Check A random sample and found that street number is
## off by a factor of 10
## 16999.0 BRODERICK 37.784932 -122.441568
## 8511.0 GOLDEN GATE 37.780462 -122.424600
## 29912.0 LAWTON 37.757248 -122.494125
## 69900.0 CALIFORNIA 37.783620 -122.491630
## 34099.0 MISSION 37.742153 -122.422218
df_merge.loc[street_outlier, ['tick_street_no', 'tick_street_name',
                              'hh_gpslat', 'hh_gpslon']].head()
## Remove a factor of 10 (integer floor division keeps it an int-like value)
df_merge.loc[street_outlier, 'tick_street_no'] = df_merge.loc[street_outlier, 'tick_street_no']//10
## Refresh outlier list
street_outlier = df_merge.tick_street_no > 8500
print('Possible still outliers: {}'.format(street_outlier.sum()))
## View the last few outliers
## It looks like the street_no are basically all wrong
## 9197.0 ALEMANY 37.720790 -122.439123 is actually 1979
## 9482.0 GEARY 37.780898 -122.465185 is actually 4315
## 8565.0 16TH 37.765213 -122.417205 is actually 2878
## 9828.0 COLLINGWOOD 37.757708 -122.435867 is actually 201
## 9769.0 40TH 37.760795 -122.499570 is actually 1391
df_merge.loc[street_outlier, ['tick_street_no', 'tick_street_name',
                              'hh_gpslat', 'hh_gpslon']].head()
## I will leave them as is. Since small population and
## no easy way.
Sometimes street_name will contain the suffix and sometimes street_suffix will be nan, which matters for streets that share the same street_name.
## make sure there are no nan (both should print 0 after the fills above)
print(df_merge.tick_street_name.isna().sum(),
      df_merge.tick_street_suffix.isna().sum())
# ## Summarize tickets by street name and suffix
# tmp = df_merge.groupby(['tick_street_name', 'tick_street_suffix'])\
#       .agg({'hh_gpslat':['size', 'count'], 'handheld': 'sum'})
# tmp.columns = ['total', 'n_gps', 'n_handheld']
# tmp.reset_index(inplace = True)
# ## Get the total tickets by street name
# tmp2 = df_merge.groupby('tick_street_name')\
#        .size().to_frame(name = 'max').reset_index()\
#        .sort_values(by = 'max', ascending = False)
# ## Combine them together. Sort by total ticket by street name
# tmp3 = pd.merge(tmp, tmp2, how = 'left', on = 'tick_street_name')\
#        .sort_values(by='max', ascending = False)
# tmp3.to_csv('./dataset/street_name_suffix_count.csv')
# tmp3.head(7)
# del tmp, tmp2, tmp3 ## wrap up
The main discrepancy is due to the difference between handwritten and handheld tickets. Some of them are obvious because there is only one suffix, while some have a few, so fixing them would take a lot of effort for seemingly little return.
But it is important to note, that if this is not corrected it will affect hand written tickets the most because they have the highest variance and if gps were to be imputed, they will most likely remain missing. However, hand written tickets are a small population (3.5%).
## Combine street name and suffix, then trim end whitespace and collapse
## double spaces left by empty components
partial_street = df_merge.tick_street_name.str\
                 .cat(df_merge.tick_street_suffix, sep = ' ')\
                 .str.strip().str.replace('  ', ' ', regex = False)
## Fix repeated suffix like ' ST ST' with ' ST'
## Need space in front because 1ST ST might be replaced
fix = [(i*2, i) for i in [' ST', ' AVE', ' BLVD',
                          ' DRIVE', ' TERR', ' COURT',
                          ' HWY', ' PLACE', ' PARK',
                          ' CIR', ' ROAD', ' LANE',
                          ' LOT', ' PLAZA', ' ALLEY',
                          ' SQUARE']]
print('Before fixing')
for d, s in fix:
    print('{}: {}'.format(d, partial_street.str.contains(d).sum()))
print('After fixing')
to_fix = [fix[0], fix[1], fix[6], fix[10], fix[12]]
## Only a few need to be fixed (the ones with nonzero counts above)
for d, s in to_fix:
    ## explicit regex=False: these are literal substrings, and the
    ## str.replace default flipped to regex=False in pandas 2.0
    partial_street = partial_street.str.replace(d, s, regex = False)
    print('{}: {}'.format(d, partial_street.str.contains(d).sum()))
## Remove nan and the '.0' decimal from the float -> str conversion.
tmp = df_merge.tick_street_no.astype('str').str.replace('nan', '', regex = False)
## FIX: raw string + explicit regex=True -- under pandas >= 2.0's
## regex=False default the old '\.0' pattern would never match.
tmp = tmp.str.replace(r'\.0', '', regex = True)
street_full = tmp.str.cat(partial_street, sep = ' ')
street_full.head()
## Add compiled street into df
df_merge['address'] = street_full.str.strip()
df_merge['street_partial'] = partial_street
## wrap up
del street_outlier, partial_street, fix, to_fix, street_full, d, s
Conclusion:
Tick_street_no
Tick_street_name
Tick_street_suffix
def sum_street(x):
    """Summarize one address group for the GPS imputation lookup table.

    Parameters
    ----------
    x : DataFrame group with columns gps_missing (bool), hh_gpslat, hh_gpslon.

    Returns
    -------
    pd.Series with ticket counts, integer percent complete (floor division),
    and the median coordinates of the tickets that do have gps. Medians are
    left as NaN when the group is fully missing (nothing to impute from) or
    fully complete (no imputation needed).
    """
    total = x.shape[0]
    no_gps = x.gps_missing.sum()
    has_gps = total - no_gps
    ## floor division keeps this an int; only exact 0/100 are skipped below
    pct_complete = has_gps * 100 // total
    if pct_complete in (0, 100):
        ## all-or-nothing group: cannot or no need to impute
        median_lat = np.nan
        median_lon = np.nan
    else:
        ## median ignores the NaN rows automatically
        median_lat = x.hh_gpslat.median()
        median_lon = x.hh_gpslon.median()
    summary = {'total': total, 'has_gps': has_gps, 'no_gps': no_gps,
               'pct_complete': pct_complete, 'median_lat': median_lat,
               'median_lon': median_lon}
    return pd.Series(summary, index = ['total', 'has_gps', 'no_gps',
                                       'pct_complete', 'median_lat',
                                       'median_lon'])
## Create imputation table: one row per exact address, with ticket
## counts and the median gps of the tickets that do have coordinates
address_gps_exact = df_merge.groupby('address').apply(sum_street)
## Count tickets that cannot be imputed because their address group has
## zero gps coordinates to take a median from
print('Number of tickets that cannot be imputed: {:,}'\
.format(int(address_gps_exact.loc[address_gps_exact.pct_complete == 0, 'no_gps'].sum())))
## Extract the ones that cannot be imputed to be manually fixed
address_gps_exact_fail = address_gps_exact[address_gps_exact.pct_complete == 0]
## Keep rows with at least 100 tickets -- only high-volume addresses are
## worth looking up by hand
address_gps_exact_fail = address_gps_exact_fail[address_gps_exact_fail.total > 100]
## Sort for easy visual
address_gps_exact_fail.sort_values(by = 'total', ascending = False, inplace = True)
print('N rows: {}'.format(address_gps_exact_fail.shape[0]))
# address_gps_exact_fail.to_csv('dataset/address_gps_exact_fail.csv')
address_gps_exact_fail.head(3)
## Remove rows where gps is na, which are the 0% and 100% complete
## addresses (sum_street left their medians as NaN)
address_gps_exact.dropna(subset=['median_lat', 'median_lon'], inplace=True)
print('Number of address before dropping: {:,}'.format(address_gps_exact.shape[0]))
## Keep an address if it passes EITHER the minimum ticket count OR the
## minimum completeness threshold
min_tickets = 30
min_complete = 75
min_count_idx = address_gps_exact.has_gps > min_tickets
min_compt_idx = address_gps_exact.pct_complete > min_complete
print('Number of address after dropping: {:,}'.format((min_count_idx | min_compt_idx).sum()))
print('===============')
print('Number of tickets affected by drop: {:,}'\
.format(int(address_gps_exact.loc[~(min_count_idx | min_compt_idx), 'no_gps'].sum())))
## Drop rows that don't meet either threshold
address_gps_exact = address_gps_exact[(min_count_idx | min_compt_idx)]
print('Number of tickets captured: {:,}'\
.format(int(address_gps_exact.no_gps.sum())))
## Manually looked-up coordinates (google maps and mapquest) for the
## high-volume addresses that had no gps at all.
## Found out that mapquest and google maps give slightly different
## lat/lon for the same address: google tends to point to buildings
## while mapquest points to streets.
fix = [('50 DRUMM', (37.795164, -122.396633)),
       ('50 DRUMM ST', (37.795164, -122.396633)),
       ('300 TOWNSEND', (37.777225, -122.394845)),
       ('301 TOWNSEND', (37.776836, -122.395335)),
       ('310 TOWNSEND', (37.776804, -122.395407)),
       ('311 TOWNSEND', (37.776665, -122.395533)),
       ('330 TOWNSEND', (37.776523, -122.395786)),
       ('331 TOWNSEND', (37.776567, -122.395695)),
       ('110 YACHT ROAD', (37.806744, -122.447658)),
       ('99 YACHT ROAD', (37.806924, -122.446578)),
       ('3950 SCOTT ST', (37.806295, -122.442323)),
       ('3950 SCOTT', (37.806295, -122.442323)),
       ## This is SF city hall
       ('DR CARLTON B GOODLETT', (37.779416, -122.418441)),
       ## Google pointed to the hospital while mapquest to the road
       ('375 LAGUNA HONDA', (37.747911, -122.458602)),
       ('375 LAGUNA HONDA BLVD', (37.747911, -122.458602)),
       ('780 MARKET', (37.786022, -122.405583)),
       ('301 MARKET', (37.792422, -122.397442)),
       ('1490 MARKET', (37.775373, -122.419027)),
       ('333 MARKET', (37.792060, -122.397885)),
       ('450 GOLDEN GATE', (37.781361, -122.418005)),
       ('1000 VAN NESS', (37.785061, -122.421182)),
       ('3500 VAN NESS', (37.807386, -122.426409)),
       ('800 JAMESTOWN', (37.716293, -122.390101)),
       ('1001 POTRERO', (37.755789, -122.406516)),
       ('350 GIRARD', (37.727979, -122.404876)),
       ('200 MARINA BLVD', (37.806047, -122.435333)),
       ('200 MARINA', (37.806047, -122.435333)),
       ('100 GROVE', (37.778497, -122.418299)),
       ('200 MASON', (37.785332, -122.409528)),
       ('100 JESSIE', (37.788412, -122.400417)),
       ('547 CLEMENT', (37.782924, -122.465226)),
       ('700 JOHN MUIR', (37.709942, -122.488090))]
## Transfer data from list to dataframe (index of the fail table is the
## address string, so .loc[a, ...] targets one row)
for a, gps in fix:
    address_gps_exact_fail.loc[a, 'median_lat'] = gps[0]
    address_gps_exact_fail.loc[a, 'median_lon'] = gps[1]
## Only keep rows with gps coordinates (i.e. the manually fixed ones)
address_gps_exact_fail.dropna(inplace = True)
print('Number of rows recovered: {:,}'.format(int(address_gps_exact_fail.no_gps.sum())))
## combine the automatic and the manually fixed lookup rows
address_gps_exact = pd.concat([address_gps_exact, address_gps_exact_fail])
## No longer needed columns
address_gps_exact.drop(columns=['total', 'has_gps', 'no_gps', 'pct_complete'], inplace=True)
## reset index so 'address' becomes a column for merging
address_gps_exact.reset_index(inplace = True)
print('Final lookup table shape: {}'.format(address_gps_exact.shape))
## Plot the imputations as a visual sanity check
pt = address_gps_exact
## NOTE(review): {'init': 'epsg:4269'} is the legacy pre-pyproj2 crs
## format; newer geopandas expects 'EPSG:4269' -- confirm against the
## installed version
gps = gpd.GeoDataFrame(pt[['address']], crs ={'init': 'epsg:4269'},
geometry = [Point(xy) for xy in zip(pt.median_lon, pt.median_lat)])
## the base map (SF)
base = sf.plot(color='white', edgecolor='black', figsize=(30,5))
gps.plot(ax=base, marker='o', color='red', markersize=10)
plt.title('GPS locations inside Exact Lookup table')
plt.show()
## wrap up
del address_gps_exact_fail, fix, min_count_idx, min_compt_idx
del min_tickets, min_complete, a, gps, base
## Take the portion of tickets that is going to be imputed
to_impute = df_merge.loc[df_merge.gps_missing, ['ticket_number', 'address']].copy().reset_index()
## Sanity check: one row per missing-gps ticket
print('N rows matches N missing gps: {}'.format(to_impute.shape[0] == df_merge.gps_missing.sum()))
## Merge with address_gps_exact lookup table
imputed = pd.merge(to_impute, address_gps_exact, how = 'left', on = 'address')
print('# of rows imputed: {:,}'.format(imputed.median_lat.notna().sum()))
## Drop all failed imputation (addresses absent from the lookup table)
imputed.dropna(inplace = True)
## Set index back to the df_merge row labels captured by reset_index,
## for quick replacement of nan values
imputed.set_index('index', inplace = True)
## Verify that index still matches with correct ticket
tmp = imputed.sample(n=20, random_state = 50)
## NOTE(review): iloc is fed the index labels captured by reset_index;
## this is only positionally correct while df_merge keeps its default
## RangeIndex -- confirm nothing upstream re-indexes it
print('Imputed index and ticket number match with df_merge: {}'\
.format((df_merge.iloc[tmp.index, df_merge.columns.get_loc('ticket_number')] == \
tmp.ticket_number).all()))
## glimpse at comparison
tmp.head(3)
## Replace missing coordinates with the imputed values
df_merge.iloc[imputed.index, df_merge.columns.get_loc('hh_gpslat')] = imputed.median_lat
df_merge.iloc[imputed.index, df_merge.columns.get_loc('hh_gpslon')] = imputed.median_lon
## Record which rows were imputed
df_merge.iloc[imputed.index, df_merge.columns.get_loc('imputed')] = True
## update gps_missing to reflect the newly filled coordinates
df_merge['gps_missing'] = df_merge.hh_gpslat.isna()
print('Current missing amount: {:,}'.format(df_merge.gps_missing.sum()))
## wrap up
del tmp, to_impute, imputed, address_gps_exact, pt
Standard blocks are usually increments of 100.
## Need to extract street_no from the compiled address because some
## tickets have the full address in the street_name column
street_no_extract = df_merge.address.str.partition(' ')
street_no_extract.head(2)
## Floor street_no to the hundreds (standard blocks are increments of 100);
## non-numeric street numbers become NaN via errors='coerce'
street_no_block = pd.to_numeric(street_no_extract[0], errors='coerce') // 100 * 100
## Convert street no to string, remove the 'nan' placeholder and the
## trailing '.0' decimal.
## Raw string + '$' anchor avoids the invalid '\.' escape and only strips
## a terminal '.0'; regex flags are explicit because the str.replace
## default changed in pandas 2.0.
street_no_block = street_no_block.astype('str').str.replace('nan', '', regex=False)\
    .str.replace(r'\.0$', '', regex=True)
## combine street block with street name + suffix and remove white space
df_merge['block'] = street_no_block.str.cat(df_merge.street_partial, sep = ' ').str.strip()
## Verify via visual
df_merge[['address', 'block']].sample(n = 5, random_state = 33)
## Create imputation table at block granularity (reuses sum_street)
address_gps_block = df_merge.groupby('block').apply(sum_street)
## Count tickets that cannot be imputed because their block group has
## zero gps coordinates
print('Number of tickets that cannot be imputed: {:,}'\
.format(int(address_gps_block.loc[address_gps_block.pct_complete == 0, 'no_gps'].sum())))
## Extract the ones that cannot be imputed to be manually fixed
address_gps_block_fail = address_gps_block[address_gps_block.pct_complete == 0]
## Keep rows with at least 100 tickets -- only high-volume blocks are
## worth fixing by hand
address_gps_block_fail = address_gps_block_fail[address_gps_block_fail.total > 100]
## Sort for easy visual
address_gps_block_fail.sort_values(by = 'total', ascending = False, inplace = True)
print('N rows: {}'.format(address_gps_block_fail.shape[0]))
address_gps_block_fail.to_csv('dataset/address_gps_block_fail.csv')
address_gps_block_fail.head(3)
Most of the ones that cannot be fixed have ambiguous street names, like California (which can be a street or an avenue) or E LOT (which doesn't say which parking lot it is).
## Remove rows where gps is na, which are the 0% and 100% complete
## blocks (sum_street left their medians as NaN)
address_gps_block.dropna(subset=['median_lat', 'median_lon'], inplace=True)
print('Number of address before dropping: {:,}'.format(address_gps_block.shape[0]))
## Keep a block if it passes EITHER the minimum ticket count OR the
## minimum completeness threshold (same values as the exact-address pass)
min_tickets = 30
min_complete = 75
min_count_idx = address_gps_block.has_gps > min_tickets
min_compt_idx = address_gps_block.pct_complete > min_complete
print('Number of address after dropping: {:,}'.format((min_count_idx | min_compt_idx).sum()))
print('===============')
print('Number of tickets affected by drop: {:,}'\
.format(int(address_gps_block.loc[~(min_count_idx | min_compt_idx), 'no_gps'].sum())))
## Drop rows that don't meet either threshold
address_gps_block = address_gps_block[(min_count_idx | min_compt_idx)]
print('Number of tickets captured: {:,}'\
.format(int(address_gps_block.no_gps.sum())))
Many of them are just missing their suffix. However, I decided to fix them by manually searching for coordinates instead of adding in the suffix.
## Manually looked-up block coordinates; I chose the middle of the block
## Did not do all of the possible ones
fix = [('700 CLEMENT', (37.782829, -122.467150)),
       ('200 COLUMBUS', (37.797519, -122.406152)),
       ('200 MARKET', (37.792908, -122.396852)),
       ('1600 GENEVA', (37.711715, -122.428041)),
       ('400 CASTRO', (37.761642, -122.435136))
       ]
## Transfer data from list to dataframe (index is the block string)
for a, gps in fix:
    address_gps_block_fail.loc[a, 'median_lat'] = gps[0]
    address_gps_block_fail.loc[a, 'median_lon'] = gps[1]
## Only keep rows with gps coordinates (i.e. the manually fixed ones)
address_gps_block_fail.dropna(inplace = True)
print('Number of rows recovered: {:,}'.format(int(address_gps_block_fail.no_gps.sum())))
## combine the automatic and the manually fixed lookup rows
address_gps_block = pd.concat([address_gps_block, address_gps_block_fail])
## No longer needed columns
address_gps_block.drop(columns=['total', 'has_gps', 'no_gps', 'pct_complete'], inplace=True)
## reset index so 'block' becomes a column for merging
address_gps_block.reset_index(inplace = True)
print('Final lookup table shape: {}'.format(address_gps_block.shape))
## Plot the imputations as a visual sanity check
pt = address_gps_block
## NOTE(review): {'init': 'epsg:4269'} is the legacy pre-pyproj2 crs
## format; newer geopandas expects 'EPSG:4269' -- confirm installed version
gps = gpd.GeoDataFrame(pt[['block']], crs ={'init': 'epsg:4269'},
geometry = [Point(xy) for xy in zip(pt.median_lon, pt.median_lat)])
## the base map (SF)
base = sf.plot(color='white', edgecolor='black', figsize=(30,5))
gps.plot(ax=base, marker='o', color='red', markersize=10)
plt.title('GPS locations inside Block Lookup table')
plt.show()
## wrap up
del address_gps_block_fail, fix, min_count_idx, min_compt_idx, a, gps
del min_tickets, min_complete, street_no_block, street_no_extract
## Take the portion of tickets that is going to be imputed (block-level pass)
to_impute = df_merge.loc[df_merge.gps_missing, ['ticket_number', 'block']].copy().reset_index()
## Sanity check: one row per still-missing ticket
print('N rows matches N missing gps: {}'.format(to_impute.shape[0] == df_merge.gps_missing.sum()))
## Merge with the address_gps_block lookup table
imputed = pd.merge(to_impute, address_gps_block, how = 'left', on = 'block')
print('# of rows imputed: {:,}'.format(imputed.median_lat.notna().sum()))
## Drop all failed imputation (blocks absent from the lookup table)
imputed.dropna(inplace = True)
## Set index back to the df_merge row labels captured by reset_index
imputed.set_index('index', inplace = True)
## Verify that index still matches with correct ticket
tmp = imputed.sample(n=20, random_state = 50)
## NOTE(review): same RangeIndex assumption as the exact-address pass --
## iloc is fed index labels, correct only for a default RangeIndex
print('Imputed index and ticket number match with df_merge: {}'\
.format((df_merge.iloc[tmp.index, df_merge.columns.get_loc('ticket_number')] == \
tmp.ticket_number).all()))
## glimpse at comparison
tmp.head(3)
## Replace missing coordinates with the imputed values
df_merge.iloc[imputed.index, df_merge.columns.get_loc('hh_gpslat')] = imputed.median_lat
df_merge.iloc[imputed.index, df_merge.columns.get_loc('hh_gpslon')] = imputed.median_lon
## Record which rows were imputed
df_merge.iloc[imputed.index, df_merge.columns.get_loc('imputed')] = True
## update gps_missing to reflect the newly filled coordinates
df_merge['gps_missing'] = df_merge.hh_gpslat.isna()
print('Current missing amount: {:,}'.format(df_merge.gps_missing.sum()))
## wrap up
del tmp, to_impute, imputed, address_gps_block
## Plot all gps points (sfplot defined earlier in the notebook)
sfplot(df_merge)
This method was placed at the end because it makes changes to the whole dataset, and I don't have a robust way to confirm that it works without bugs or that it won't take a long time to complete. I will limit it to the last remaining bit, which is only a very small proportion. Also, streets with a clearly wrong suffix will stay missing regardless, so the position of this method doesn't matter. I was not able to make this work fully because it was hard to fix streets containing common words, like MISSION ROCK vs MISSION ST.
## Find streets that appear with more than one suffix -- these are
## ambiguous and cannot be safely geocoded
street_more_suffix = off_street_names.groupby(['StreetName']).size()\
    .to_frame(name = 'n_suffix').reset_index()
street_more_suffix = street_more_suffix[street_more_suffix.n_suffix > 1]
print(street_more_suffix.shape)
## Extract the tickets that are still missing gps,
## using block to impute
to_impute = df_merge.loc[df_merge.gps_missing, ['ticket_number', 'address', 'block']]\
    .copy().reset_index()
print('N rows matches N missing gps: {}'.format(to_impute.shape[0] == df_merge.gps_missing.sum()))
## Remove rows that clearly cannot be fixed:
## keep only full addresses (leading street number) and cross streets
to_impute = to_impute[to_impute.block.str.contains(' AND ', regex=False) |
                      to_impute.block.str.contains(r'^\d+ ', na=False)].copy()
print(to_impute.shape)
print('Before: {:,}'.format(to_impute.shape[0]))
## Remove streets with multiple possible suffixes
## (.items() -- iteritems was removed in pandas 2.0)
for i, street in street_more_suffix.StreetName.items():
    ## \b word boundaries: must match the whole street name
    ## NOTE(review): street names containing regex metacharacters would
    ## need re.escape here -- confirm none exist in the data
    match = to_impute.address.str.contains('\\b' + street + '\\b', na=False)
    if match.any():
        to_impute = to_impute[~match]
print('After drop: {:,}'.format(to_impute.shape[0]))
## Obtain the top # of blocks to geocode (api quota is limited)
top = 2400
block_count = to_impute.block.value_counts(ascending=False)
top_block = block_count.index[:top]
print('Number of rows captured: {:,}'.format(block_count[:top].sum()))
## One-time geocoding pass, kept commented out so the notebook does not
## hit the google api (and its quota) on every run
# ## Save geocode result
# geo_cache = pd.DataFrame({'lat': 0.0, 'lon': 0.0,
#                           'confidence': 0, 'status': 'none'},
#                          index = top_block)
# ## Obtain lat and lon with google geocoder api
# ## by looping over the collected address
# for address in geo_cache.index.tolist()[357:]:
#     ## You need an api key or else you will get OVER_QUERY_LIMIT
#     ## on the first query
#     g = geocoder.google(address + ', San Francisco, CA',
#                         key = 'YOUR-OWN')
#     if g.status == 'OVER_QUERY_LIMIT':
#         print('OVER_QUERY_LIMIT')
#         break
#     ## skip if not okay i.e ZERO_RESULTS
#     if g.status != 'OK':
#         geo_cache.at[address, 'status'] = g.status
#         continue
#     geo_cache.at[address, 'lat'] = g.latlng[0]
#     geo_cache.at[address, 'lon'] = g.latlng[1]
#     geo_cache.at[address, 'confidence'] = g.confidence
#     geo_cache.at[address, 'status'] = g.status
## Save result so I don't need to use geocode again.
# geo_cache.to_csv('dataset/geo_cache_new.csv')
## read in previously geocoded addresses
geo_cache = pd.read_csv('dataset/geo_cache_final.csv', index_col = 0)
## Remove failed searches (e.g. ZERO_RESULTS)
geo_cache = geo_cache[geo_cache.status == 'OK']
## the index holds the block string; name it so it can merge on 'block'
geo_cache.index.name = 'block'
geo_cache.reset_index(inplace = True)
print(geo_cache.shape)
geo_cache.head(2)
## Determine geocoded points that fall outside of the SF boundary
tmp = geo_cache.reset_index()
## NOTE(review): {'init': 'epsg:4269'} is the legacy pre-pyproj2 crs
## format; newer geopandas expects 'EPSG:4269' -- confirm installed version
gps = gpd.GeoDataFrame(tmp[['index','block']], crs ={'init': 'epsg:4269'},
geometry = [Point(xy) for xy in zip(tmp.lon, tmp.lat)])
## Spatial join: points inside SF get its NAME, points outside get NaN
inOut = sjoin(gps, sf[['NAME', 'geometry']], how = 'left')
print('Number of rows outside of SF: {}'.format(inOut.NAME.isna().sum()))
base = sf.plot(color = 'white', edgecolor = 'black', figsize=(30,5))
## Blue if inside
inOut[inOut.NAME.notna()].plot(ax=base, marker='o', color='blue',markersize=10)
## Red if outside
inOut[inOut.NAME.isna()].plot(ax=base, marker='o', color='red',markersize=10)
plt.title('Geocode Points Inside vs Outside')
plt.show()
## set index back to the positional index captured above
inOut.set_index('index', inplace=True)
## find points outside of SF
outside = inOut[inOut.NAME.isna()]
outside
print('Before: {}'.format(geo_cache.shape[0]))
## Drop the bad points (geocoder placed them outside of SF)
geo_cache.drop(index=outside.index, inplace=True)
print('After: {}'.format(geo_cache.shape[0]))
## Merge the remaining missing tickets with the geo_cache lookup table
imputed = pd.merge(to_impute, geo_cache, how = 'left', on = 'block')
print('# of rows imputed: {:,}'.format(imputed.lat.notna().sum()))
## Drop all failed imputation (blocks absent from geo_cache)
imputed.dropna(inplace = True)
## Set index back to the df_merge row labels for quick replacement
imputed.set_index('index', inplace = True)
## Verify that index still matches with correct ticket
tmp = imputed.sample(n=20, random_state = 50)
## NOTE(review): same RangeIndex assumption as the earlier passes
print('Imputed index and ticket number match with df_merge: {}'\
.format((df_merge.iloc[tmp.index, df_merge.columns.get_loc('ticket_number')] == \
tmp.ticket_number).all()))
## glimpse at comparison
tmp.head(3)
## Replace missing coordinates with the geocoded values
df_merge.iloc[imputed.index, df_merge.columns.get_loc('hh_gpslat')] = imputed.lat
df_merge.iloc[imputed.index, df_merge.columns.get_loc('hh_gpslon')] = imputed.lon
## Record which rows were imputed
df_merge.iloc[imputed.index, df_merge.columns.get_loc('imputed')] = True
## update gps_missing to reflect the newly filled coordinates
df_merge['gps_missing'] = df_merge.hh_gpslat.isna()
print('Current missing amount: {:,}'.format(df_merge.gps_missing.sum()))
## wrap up
del off_street_names, street_more_suffix, to_impute, block_count, top, top_block, imputed
del geo_cache, i, street, match, base, pt, inOut, gps, outside
## Check to see if data was accidentally overwritten:
## gps that weren't missing should not have been imputed.
## (gps_missing_record != imputed) marks tickets that were originally
## missing but not imputed (or vice versa); it should exactly equal the
## still-missing set, so the whole expression should be False
print('There are gps data accidently overwritten: {}'\
.format(((df_merge.gps_missing_record != df_merge.imputed) != df_merge.gps_missing).any()))
sfplot(df_merge)
## Obtain three random streets with at least 500 non-imputed and imputed
## rows each, for the visual before/after comparison below
tmp = df_merge.groupby('tick_street_name').agg({'imputed': 'sum', 'gps_missing_record': 'sum'})
tmp[(tmp.imputed > 500) & (tmp.gps_missing_record > 500)].sample(n = 3, random_state = 42)
## Export for tableau visualization (one-time, kept commented out)
# df_merge.loc[df_merge.tick_street_name.isin(['MISSION', 'LINCOLN WAY', '43RD', 'NATOMA']),
#              ['tick_street_name', 'hh_gpslat', 'hh_gpslon', 'imputed']]\
#              .to_csv('dataset/tableau/df_merge_impute.csv')
## Display original (non-imputed) data first
Image(filename = os.path.join('.','image','imputed_3_not.png'),width = 500, height = 100)
## Manually visualize whether the imputation looks fine
Image(filename = os.path.join('.','image','imputed_3.png'),width = 500, height = 100)
## Same check for the street with the most tickets (MISSION):
## original data vs imputed
Image(filename = os.path.join('.','image','imputed_mission.png'),width = 500, height = 100)
Conclusion: After checking 3 random streets and the street with the most tickets, I conclude that the imputation worked just fine.
## make sure there weren't any duplicate tickets introduced
print('N duplicates? {}'\
.format(df_merge.duplicated(subset='ticket_number', keep=False).sum()))
## drop intermediate/redundant columns that aren't useful anymore
df_merge.drop(columns = ['_merge', 'tick_street_no',
                         'tick_street_suffix', 'gps_missing',
                         'street_partial', 'block',
                         'violation_code', 'violation',
                         'violation_desc_long'], inplace = True)
print(df_merge.shape)
## reorder columns in a more meaningful way
df_merge = df_merge.reindex(copy = False,
                            columns = ['issue_datetime', 'ticket_number',
                                       'tick_street_name','address', 'hh_gpslat',
                                       'hh_gpslon', 'violation_desc_general',
                                       'violation_group','tick_corr_type',
                                       'tick_corr_type_desc_long', 'handheld',
                                       'badge', 'state_full', 'plate', 'tick_vin',
                                       'tick_rp_vin', 'gps_missing_record',
                                       'imputed', 'veh_body'])
## show_counts replaces null_counts (deprecated in pandas 1.2, removed in 2.0)
df_merge.info(show_counts=True)
## Export for the team
df_merge.to_csv('dataset/df_merge_cleaned_sm.csv')
## export for me
df_merge.to_pickle('dataset/df_merge-sm.pickle', compression='gzip')
## import backup
# df_merge = pd.read_pickle('dataset/df_merge.pickle')
TODO List:
Questions:
Random insight: